diff --git a/paddle/fluid/feed/pybind/pybind.cc b/paddle/fluid/feed/pybind/pybind.cc
deleted file mode 100755
index a4cea8608d1d42069af9fb9047863f82251c60f7..0000000000000000000000000000000000000000
--- a/paddle/fluid/feed/pybind/pybind.cc
+++ /dev/null
@@ -1,1753 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include
-#include
-#include
-#include
-#include
-#include // NOLINT // for call_once
-#include
-#include
-#include
-#include
-#include
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/py_func_op.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-#include "paddle/fluid/pybind/const_value.h"
-#include "paddle/fluid/pybind/data_set_py.h"
-#include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/fleet_wrapper_py.h"
-#include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/pybind/inference_api.h"
-#include "paddle/fluid/pybind/ir.h"
-#include "paddle/fluid/pybind/expand_api.h"
-
-#ifndef _WIN32
-#include "paddle/fluid/pybind/nccl_wrapper_py.h"
-#endif
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/pybind/pybind.h" // NOLINT
-#include "paddle/fluid/pybind/reader_py.h"
-#include "paddle/fluid/pybind/tensor_py.h"
-#include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#endif
-#include "paddle/fluid/platform/cuda_profiler.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#endif
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/pybind/communicator_py.h"
-#endif
-
-#include "pybind11/stl.h"
-
-DEFINE_bool(reader_queue_speed_test_mode, false, - "If set true, the queue.pop will only get data from queue but not " - "remove the data from queue for speed testing"); -DECLARE_bool(use_mkldnn); -#ifdef PADDLE_WITH_NGRAPH -DECLARE_bool(use_ngraph); -#endif - -// disable auto conversion to list in Python -PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); - -namespace paddle { -namespace pybind { -bool IsCompiledWithCUDA() { -#ifndef PADDLE_WITH_CUDA - return false; -#else - return true; -#endif -} - -bool IsCompiledWithMKLDNN() { -#ifndef PADDLE_WITH_MKLDNN - return false; -#else - return true; -#endif -} - -bool IsCompiledWithNGRAPH() { -#ifndef PADDLE_WITH_NGRAPH - return false; -#else - return true; -#endif -} - -bool IsCompiledWithBrpc() { -#ifndef PADDLE_WITH_DISTRIBUTE - return false; -#endif - -#ifdef PADDLE_WITH_GRPC - return false; -#endif - - return true; -} - -bool IsCompiledWithDIST() { -#ifdef PADDLE_WITH_DISTRIBUTE - return true; -#else - return false; -#endif -} - -template -static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { - return paddle::platform::Place(p1) == paddle::platform::Place(p2); -} - -template -static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).which()); -} - -#ifdef PADDLE_WITH_AVX -PYBIND11_MODULE(core_avx, m) { -#else -PYBIND11_MODULE(core_noavx, m) { -#endif - - // Not used, just make sure cpu_info.cc is linked. - paddle::platform::CpuTotalPhysicalMemory(); - - paddle::memory::allocation::UseAllocatorStrategyGFlag(); - - m.doc() = "C++ core of PaddlePaddle"; - - // using framework in this function. Since it is inside a function, it will - // not cause namespace pollution. - using namespace paddle::framework; // NOLINT - - BindException(&m); - - m.def("set_num_threads", &platform::SetNumThreads); - - m.def( - "_append_python_callable_object_and_return_id", - [](py::object py_obj) -> size_t { - return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); - }); - - m.def("_get_use_default_grad_op_desc_maker_ops", - [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - - // NOTE(zjl): ctest would load environment variables at the beginning even - // though we have not `import paddle.fluid as fluid`. So we add this API - // to enable eager deletion mode in unittest. 
- m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode); - - m.def("_set_fuse_parameter_group_size", - &paddle::framework::ir::SetFuseParameterGroupsSize); - m.def("_set_fuse_parameter_memory_size", - &paddle::framework::ir::SetFuseParameterMemorySize); - - m.add_object("_cleanup", - py::capsule([]() { ScopePool::Instance().Clear(); })); - - m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath); - - BindImperative(&m); - - py::class_(m, "Tensor", py::buffer_protocol()) - .def("__array__", [](Tensor &self) { return TensorToPyArray(self); }) - .def("_is_initialized", - [](const Tensor &self) { return self.IsInitialized(); }) - .def("_get_dims", - [](const Tensor &self) { return vectorize(self.dims()); }) - .def("_set_dims", - [](Tensor &self, const std::vector &dim) { - self.Resize(make_ddim(dim)); - }) - .def("_set_layout", - [](Tensor &self, const std::string &layout) { - self.set_layout(StringToDataLayout(layout)); - }) - .def("_alloc_float", - [](Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_double", - [](Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_clear", &Tensor::clear) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) - .def("set", PyCPUTensorSetFromArray) -#ifdef PADDLE_WITH_CUDA - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDATensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) - .def("set", PyCUDAPinnedTensorSetFromArray) -#endif - .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) - .def("_set_float_element", TensorSetElement) - .def("_get_float_element", TensorGetElement) - .def("_set_double_element", TensorSetElement) - .def("_get_double_element", TensorGetElement) - .def("_place", [](Tensor &self) { return self.place(); }) - .def("_dtype", [](Tensor &self) { return self.type(); }) - .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", [](const Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }); - - py::class_(m, "LoDTensor", R"DOC( - LoDTensor is a Tensor with optional LoD information. 
- - np.array(lod_tensor) can convert LoDTensor to numpy array. - lod_tensor.lod() can retrieve the LoD information. - - LoD is short for Level of Details and is usually used for varied sequence - length. You can skip the following comment if you don't need optional LoD. - - For example, a LoDTensor X can look like the example below. It contains - 2 sequences. The first has length 2 and the second has length 3, as - described by x.lod. - - The first tensor dimension 5=2+3 is calculated from LoD if it's available. - It means the total number of sequence element. In X, each element has 2 - columns, hence [5, 2]. - - x.lod = [[2, 3]] - - x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] - - x.shape = [5, 2] - - LoD can have multiple levels (for example, a paragraph can have multiple - sentences and a sentence can have multiple words). In the following - LodTensor Y, the lod_level is 2. It means there are 2 sequence, the - first sequence length is 2 (has 2 sub-sequences), the second one's - length is 1. The first sequence's 2 sub-sequences have length 2 and 2, - respectively. And the second sequence's 1 sub-sequence has length 3. - - y.lod = [[2 1], [2 2 3]] - - y.shape = [2+2+3, ...] - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - t = fluid.LoDTensor() - - Note: - In above description, LoD is length-based. In Paddle internal - implementation, lod is offset-based. Hence, internally, - y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based - equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]). - - Sometimes LoD is called recursive_sequence_length to be more - self-explanatory. In this case, it must be length-based. Due to history - reasons. when LoD is called lod in public API, it might be offset-based. - Users should be careful about it. - )DOC") - .def("__array__", [](Tensor &self) { return TensorToPyArray(self); }) - .def("__init__", - [](LoDTensor &instance, const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), true, - "the provided recursive_sequence_lengths info is invalid"); - new (&instance) LoDTensor(new_offset_lod); - }) - .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) - // We implement offset based LOD in C++ while we use length based with - // Python API. So we changed set_lod to set_recursive_sequence_lengths to - // avoid misuse. - // The discussion is here: - // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def("set_lod", - [](LoDTensor &self, const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), true, - "the provided lod info is invalid"); - self.set_lod(new_lod); - }, - py::arg("lod"), R"DOC( - Set LoD of the LoDTensor. - - Args: - lod (List[List[int]]): the lod to be set. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - )DOC") - .def("set_recursive_sequence_lengths", - [](LoDTensor &self, const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, - "the provided recursive_sequence_lengths info is invalid"); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), R"DOC( - Set LoD of the LoDTensor according to recursive sequence length. - - For example, if recursive_sequence_lengths=[[2, 3]], meaning that - there are two sequences with length 2 and 3 respectively, the - corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]]. - - Args: - recursive_sequence_lengths (List[List[int]]): sequence lengths. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - )DOC") - .def("lod", - [](LoDTensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the LoD of the LoDTensor. - - Returns: - out (List[List[int]]): the lod of the LoDTensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - // Set above comments of set_lod. - .def("recursive_sequence_lengths", - [](LoDTensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the sequence length of the LoDTensor corresponding to LoD. - - Returns: - out (List[List[int]): the sequence lengths. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - )DOC") - .def("has_valid_recursive_sequence_lengths", - [](LoDTensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the LoDTensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( - Check whether the lod of the LoDTensor is valid. - - Returns: - out (bool): whether the lod is valid. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.has_valid_recursive_sequence_lengths()) # True - )DOC") - .def("__getitem__", PySliceTensor, py::return_value_policy::reference, - R"DOC( - Slice the original Tensor, and remove the LoD information. 
- - Returns: - out (Tensor): new Tensor(NOT LoDTensor). - )DOC") - .def("__str__", - [](const LoDTensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) - .def("_copy", [](const LoDTensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - LoDTensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; - }); - - py::class_(m, "SelectedRows") - .def("__init__", - [](SelectedRows &instance) { new (&instance) SelectedRows(); }) - .def("__init__", - [](SelectedRows &instance, const std::vector rows, - const int64_t &height) { - new (&instance) SelectedRows(rows, height); - }) - .def("get_tensor", - [](SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) - .def("numel", - [](SelectedRows &self) -> int64_t { return self.value().numel(); }) - .def("set_height", &SelectedRows::set_height) - .def("height", &SelectedRows::height) - .def("set_rows", - [](SelectedRows &self, std::vector rows) { -#ifndef PADDLE_WITH_CUDA - self.set_rows(rows); -#else - Vector new_rows(rows); - self.set_rows(new_rows); -#endif - }) - .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](SelectedRows &self) { - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; - }); - - py::class_(m, "Variable", R"DOC(Variable Class. - -All parameter, weight, gradient are variables in Paddle. -)DOC") - .def(py::init<>()) - .def("is_int", [](const Variable &var) { return var.IsType(); }) - .def("set_int", - [](Variable &var, int val) -> void { *var.GetMutable() = val; }) - .def("get_int", [](const Variable &var) -> int { return var.Get(); }) - .def("is_float", [](const Variable &var) { return var.IsType(); }) - .def("set_float", - [](Variable &var, float val) -> void { - *var.GetMutable() = val; - }) - .def("get_float", - [](const Variable &var) -> float { return var.Get(); }) - .def("get_tensor", - [](Variable &self) -> LoDTensor * { - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_lod_rank_table", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_selected_rows", - [](Variable &self) -> SelectedRows * { - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_lod_tensor_array", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) -#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - .def("get_communicator", - [](Variable &self) -> platform::Communicator * { - return self.GetMutable(); - }, - py::return_value_policy::reference) -#endif - .def("get_reader", - [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE_EQ(self.IsType(), true); - return self.GetMutable(); - }, - py::return_value_policy::reference); - - BindReader(&m); - - using LoDTensorBlockingQueue = - ::paddle::operators::reader::LoDTensorBlockingQueue; - using LoDTensorBlockingQueueHolder = - ::paddle::operators::reader::LoDTensorBlockingQueueHolder; - - py::class_>( - m, "LoDTensorBlockingQueue", "") - .def("push", - [](LoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { - pybind11::gil_scoped_release release; - return self.Push(lod_tensor_vec); - }) - .def("size", 
&LoDTensorBlockingQueue::Size) - .def("capacity", &LoDTensorBlockingQueue::Cap) - .def("close", &LoDTensorBlockingQueue::Close) - .def("is_closed", &LoDTensorBlockingQueue::IsClosed); - - m.def("init_lod_tensor_blocking_queue", - [](Variable &var, - size_t capacity) -> std::shared_ptr { - VLOG(1) << "init_lod_tensor_blocking_queue"; - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, - py::return_value_policy::copy); - - py::class_(m, "_Scope", R"DOC( - Scope is an association of a name to Variable. All variables belong to Scope. - - Variables in a parent scope can be retrieved from local scope. - - You need to specify a scope to run a Net, i.e., `exe.Run(&scope)`. - One net can run in different scopes and update different variable in the - scope. - - You can create var in a scope and get it from the scope. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - # create tensor from a scope and set value to it. - param = scope.var('Param').get_tensor() - param_array = np.full((height, row_numel), 5.0).astype("float32") - param.set(param_array, place) - - )DOC") - .def("_remove_from_pool", - [](Scope &self) { ScopePool::Instance().Remove(&self); }) - .def("var", - [](Scope &self, const std::string &name) -> Variable * { - return self.Var(name); - }, - py::arg("name"), - R"DOC( - Find or create variable named :code:`name` in the current scope. - - If the variable named :code:`name` does not exist in the - current scope, the variable would be created. Otherwise, - return the existing variable. - - Args: - name (str): the variable name. - - Returns: - out (core.Variable): the found or created variable. - )DOC", - py::return_value_policy::reference) - .def("find_var", &Scope::FindVar, py::arg("name"), - R"DOC( - Find variable named :code:`name` in the current scope or - its parent scope. Return None if not found. - - Args: - name (str): the variable name. - - Returns: - out (core.Variable|None): the found variable or None. - )DOC", - py::return_value_policy::reference) - .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, - R"DOC( - Create a new sub-scope of the current scope. - - Returns: - out (core._Scope): the created sub-scope. - )DOC", - py::return_value_policy::reference) - .def("drop_kids", &Scope::DropKids, - R"DOC( - Delete all sub-scopes of the current scope. - )DOC") - .def("_kids", &Scope::kids); - - m.def("Scope", - []() -> Scope * { - auto *s = new Scope(); - ScopePool::Instance().Insert(std::unique_ptr(s)); - return s; - }, - R"DOC( - Create a new scope. - - Returns: - out (core._Scope): the created scope. - )DOC", - py::return_value_policy::reference); - - //! @note: Be careful! PyBind will return std::string as an unicode, not - //! Python str. If you want a str object, you should cast them in Python. - m.def("get_all_op_protos", []() -> std::vector { - std::vector ret_values; - for (auto &iter : OpInfoMap::Instance().map()) { - auto &info = iter.second; - if (info.HasOpProtoAndChecker()) { - std::string str; - PADDLE_ENFORCE_EQ( - info.Proto().SerializeToString(&str), true, - "Serialize OpProto Error. 
This could be a bug of Paddle."); - ret_values.emplace_back(str); - } - } - return ret_values; - }); - m.def( - "get_grad_op_desc", [](const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - const std::vector &grad_sub_block) { - std::unordered_map grad_to_var; - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, - grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); - std::transform(grad_op_descs.begin(), grad_op_descs.end(), - grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); - return std::make_pair(grad_op_desc_ptrs, grad_to_var); - }); - m.def("has_grad_op_maker", [](const std::string op_type) { - return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker(); - }); - m.def("has_infer_inplace", [](const std::string op_type) { - return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace(); - }); - m.def("get_flags_use_mkldnn", []() { return FLAGS_use_mkldnn; }); -#ifdef PADDLE_WITH_NGRAPH - m.def("get_flags_use_ngraph", []() { return FLAGS_use_ngraph; }); -#endif - - m.def("prune", [](const ProgramDesc &origin, - const std::set &feeded_var_names, - const std::vector> &targets) { - ProgramDesc prog_with_targets(origin); - - for (const auto &t : targets) { - prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); - } - proto::ProgramDesc pruned_desc; - Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); - return new ProgramDesc(pruned_desc); - }); - m.def("prune_backward", [](const framework::ProgramDesc &program) { - return PruneBackward(program); - }); - m.def("empty_var_name", - []() { return std::string(framework::kEmptyVarName); }); - m.def("grad_var_suffix", - []() { return std::string(framework::kGradVarSuffix); }); - m.def_submodule( - "var_names", - "The module will return special predefined variable name in Paddle") - .def("empty", []() { return kEmptyVarName; }) - .def("temp", []() { return kTempVarName; }); - // clang-format off - py::class_(m, "DeviceContext") - .def_static("create", - [](paddle::platform::CPUPlace& place) - -> paddle::platform::DeviceContext* { - return new paddle::platform::CPUDeviceContext(); - }) - .def_static("create", - [](paddle::platform::CUDAPlace& place) - -> paddle::platform::DeviceContext* { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW("CUDAPlace is not supported in CPU device."); -#else - return new paddle::platform::CUDADeviceContext(place); -#endif - }) - .def_static("create", - [](paddle::platform::CUDAPinnedPlace& place) - -> paddle::platform::DeviceContext* { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW( - "CUDAPinnedPlace is not supported in CPU device."); -#else - return new paddle::platform::CUDAPinnedDeviceContext(place); -#endif - });; -// clang-format on -#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - py::class_(m, "Communicator").def(py::init<>()); -#endif - py::class_(m, "CUDAPlace", R"DOC( - CUDAPlace is a descriptor of a device. It represents a GPU, and each CUDAPlace - has a dev_id to indicate the number of cards represented by the current CUDAPlace. - The memory of CUDAPlace with different dev_id is not accessible. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - gpu_place = fluid.CUDAPlace(0) - - )DOC") - .def("__init__", - [](platform::CUDAPlace &self, int dev_id) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - - if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) { - if (platform::GetCUDADeviceCount() == 0) { - LOG(ERROR) << "Cannot use GPU because there is no GPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " - "number on your machine is %d", - dev_id, platform::GetCUDADeviceCount(), - platform::GetCUDADeviceCount()); - std::exit(-1); - } - } - - new (&self) platform::CUDAPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use GPU because you have installed CPU version " - "PaddlePaddle.\n" - "If you want to use GPU, please try to install GPU version " - "PaddlePaddle by: pip install paddlepaddle-gpu\n" - "If you only have CPU, please change CUDAPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__str__", string::to_string); - - py::class_(m, "CPUPlace", R"DOC( - CPUPlace is a descriptor of a device. It represents a CPU, and the memory - CPUPlace can be accessed by CPU. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - cpu_place = fluid.CPUPlace() - - )DOC") - .def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__str__", string::to_string); - - py::class_(m, "CUDAPinnedPlace", R"DOC( - CUDAPinnedPlace is a descriptor of a device. The memory of CUDAPinnedPlace - can be accessed by GPU and CPU. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - place = fluid.CUDAPinnedPlace() - - )DOC") - .def("__init__", - [](platform::CUDAPinnedPlace &self) { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); -#endif - new (&self) platform::CUDAPinnedPlace(); - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__str__", string::to_string); - - py::class_(m, "Place") - .def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("gpu_device_id", - [](platform::Place &self) { - return boost::get(self).device; - }) - .def("set_place", [](platform::Place &self, - const platform::Place &other) { self = other; }) - .def("set_place", - [](platform::Place &self, const platform::CPUPlace &cpu_place) { - self = cpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CUDAPlace &gpu_place) { - self = gpu_place; - }) - .def("set_place", [](platform::Place &self, - const platform::CUDAPinnedPlace &cuda_pinned_place) { - self = cuda_pinned_place; - }); - - py::class_(m, "Operator") - .def_static( - "create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - return OpRegistry::CreateOp(desc); - }) - .def("run", - [](OperatorBase &self, const Scope &scope, - const platform::CPUPlace &place) { self.Run(scope, place); }) - .def("run", - [](OperatorBase &self, const Scope &scope, - const platform::CUDAPlace &place) { self.Run(scope, place); }) - .def("run", - [](OperatorBase &self, const Scope &scope, - const platform::CUDAPinnedPlace &place) { - self.Run(scope, place); - }) - .def("type", - [](const OperatorBase &op) -> std::string { return op.Type(); }) - .def("outputs", - [](const OperatorBase &op) - -> std::map> { - return op.Outputs(); - }) - .def("output_vars", - [](const OperatorBase &op) { return op.OutputVars(true); }) - .def("inputs", [](const OperatorBase &op) { return op.Inputs(); }) - .def("input_vars", [](const OperatorBase &op) { return op.InputVars(); }) - .def("__str__", &OperatorBase::DebugString) - .def("no_intermediate_outputs", - [](const OperatorBase &op) { return op.OutputVars(false); }) - .def("support_gpu", &OperatorBase::SupportGPU); - - py::class_(m, "ExecutorPrepareContext") - .def(py::init()); - - py::class_(m, "Executor") - .def(py::init()) - .def("close", &Executor::Close) - .def("run_from_dataset", &Executor::RunFromDataset, - py::call_guard()) - .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - std::map *feed_targets, - std::map *fetch_targets, - bool create_local_scope = true, bool create_vars = true, - const std::string &feed_holder_name = "feed", - const std::string &fetch_holder_name = "fetch") { - pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets, 
- create_local_scope, create_vars, - feed_holder_name, fetch_holder_name); - }) - .def("run_cached_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - bool create_local_scope = true, bool create_vars = true, - bool keep_kids = false) { - pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, create_local_scope, - create_vars, keep_kids); - }) - .def("prepare_ctx_cache", &Executor::PrepareCtxCache, - py::call_guard()) - .def("create_variables", &Executor::CreateVariables, - py::call_guard()) - .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars, - const std::vector &fetch_vars) { - pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars, - fetch_vars); - }); - - m.def("init_gflags", framework::InitGflags); - m.def("init_glog", framework::InitGLOG); - m.def("init_dgc", framework::InitDGC); - m.def("init_devices", - [](bool init_p2p) { framework::InitDevices(init_p2p); }); - - m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH); - m.def("is_compiled_with_cuda", IsCompiledWithCUDA); - m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); - m.def("is_compiled_with_brpc", IsCompiledWithBrpc); - m.def("is_compiled_with_dist", IsCompiledWithDIST); -#ifdef PADDLE_WITH_CUDA - m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { - // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetCUDAComputeCapability(place.device) >= 53; - }); -#endif - - m.def("set_feed_variable", framework::SetFeedVariable); - m.def("get_fetch_variable", framework::GetFetchVariable); - m.def("get_variable_tensor", framework::GetVariableTensor); - - m.def("_is_program_version_supported", IsProgramVersionSupported); - - BindProgramDesc(&m); - BindBlockDesc(&m); - BindVarDsec(&m); - BindOpDesc(&m); - BindConstValue(&m); - - py::class_(m, "LodRankTable") - .def("items", [](framework::LoDRankTable &table) { - std::vector> res; - for (auto &item : table.items()) { - res.push_back({item.index, item.length}); - } - return res; - }); - - py::class_(m, "LoDTensorArray", R"DOC( - Array of LoDTensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - arr = fluid.LoDTensorArray() -)DOC") - .def("__init__", - [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); }) - .def("__getitem__", - [](LoDTensorArray &self, size_t i) { return &self.at(i); }, - py::return_value_policy::reference) - .def("__len__", [](LoDTensorArray &self) { return self.size(); }) - .def("__setitem__", - [](LoDTensorArray &self, size_t i, const LoDTensor &t) { - PADDLE_ENFORCE_LT(i, self.size()); - self[i].ShareDataWith(t); - self[i].set_lod(t.lod()); - }) - .def("append", - [](LoDTensorArray &self, const LoDTensor &t) { - self.emplace_back(); - self.back().ShareDataWith(t); - self.back().set_lod(t.lod()); - }, - py::arg("tensor"), R"DOC( - Append a LoDensor to LoDTensorArray. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - arr = fluid.LoDTensorArray() - t = fluid.LoDTensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - arr.append(t) - )DOC") - .def("_move_to_list", - [](LoDTensorArray &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - res[i] = py::cast(std::move(self[i])); - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership); - - m.def("op_support_gpu", OpSupportGPU); -#ifdef PADDLE_WITH_CUDA - m.def("get_cuda_device_count", platform::GetCUDADeviceCount); - -#ifndef _WIN32 - m.def("nvprof_init", platform::CudaProfilerInit); - m.def("nvprof_start", platform::CudaProfilerStart); - m.def("nvprof_stop", platform::CudaProfilerStop); -#endif -#endif - - py::enum_(m, "ProfilerState", py::arithmetic()) - .value("kDisabled", platform::ProfilerState::kDisabled) - .value("kCPU", platform::ProfilerState::kCPU) - .value("kCUDA", platform::ProfilerState::kCUDA) - .value("kAll", platform::ProfilerState::kAll) - .export_values(); - - py::enum_(m, "EventSortingKey", py::arithmetic()) - .value("kDefault", platform::EventSortingKey::kDefault) - .value("kCalls", platform::EventSortingKey::kCalls) - .value("kTotal", platform::EventSortingKey::kTotal) - .value("kMin", platform::EventSortingKey::kMin) - .value("kMax", platform::EventSortingKey::kMax) - .value("kAve", platform::EventSortingKey::kAve) - .export_values(); - - m.def("enable_profiler", platform::EnableProfiler); - m.def("disable_profiler", platform::DisableProfiler); - m.def("is_profiler_enabled", platform::IsProfileEnabled); - m.def("reset_profiler", platform::ResetProfiler); - m.def("get_pass", [](const std::string &pass_type) { - auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); - return std::shared_ptr(std::move(pass)); - }); - - m.def("size_of_dtype", framework::SizeOfType); - - using VarQuantScale = - std::unordered_map>; - - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) - .def("set", - [](ir::Pass &self, const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); - - // -- python binds for parallel executor. 
- - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = fluid.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - - )DOC"); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }, - R"DOC(The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. - If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0.)DOC") - .def_property( - "use_cuda", - [](const ExecutionStrategy &self) { return self.use_cuda_; }, - [](ExecutionStrategy &self, bool use_cuda) { - self.use_cuda_ = use_cuda; - }) // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may - // make user confuse, because ParallelExecutor has a parameter named - // 'use_cuda' too, in current implementation, ParallelExecutor's - // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 1. - - NOTES: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. 
- )DOC") - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call pe.run() in python - )DOC") - .def_property("_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.reduce_ = strategy; - }, - R"DOC(The type is fluid.BuildStrategy.ReduceStrategy, there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default 'AllReduce'. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finalized."); - self.gradient_scale_ = strategy; - }, - R"DOC(The type is fluid.BuildStrategy.GradientScaleStrategy, there are three - ways of defining :math:`loss@grad` in ParallelExecutor, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default 'CoeffNumDevice'. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.compiler as compiler - import numpy - import os - - use_cuda = True - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, fluid will use - # all the number of the logic core as the CPU_NUM, - # in that case, the batch size of the input should be - # greater than CPU_NUM, if not, the process will be - # failed by an exception. - if not use_cuda: - os.environ['CPU_NUM'] = str(2) - places = fluid.cpu_places() - else: - places = places = fluid.cuda_places() - - data = fluid.layers.data(name='X', shape=[1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) - - fluid.default_startup_program().random_seed=1 - exe.run(fluid.default_startup_program()) - - build_strategy = fluid.BuildStrategy() - build_strategy.gradient_scale_strategy = \ - fluid.BuildStrategy.GradientScaleStrategy.Customized - compiled_prog = compiler.CompiledProgram( - fluid.default_main_program()).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, - places = places) - - dev_count = len(places) - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 - loss_grad_name = loss.name+"@GRAD" - loss_data = exe.run(compiled_prog, - feed={"X": x, loss_grad_name : loss_grad}, - fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.debug_graphviz_path_ = path; - }, - R"DOC(The type is STR, debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default "" - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.debug_graphviz_path = "./graph" - - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.enable_sequential_execution_ = b; - }, - R"DOC(The type is BOOL. If set True, the execution order of ops would - be the same as what is in the program. Default False. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.remove_unnecessary_lock_ = b; - }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default True. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW("Windows has NO support to distribute mode."); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property("trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property("use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property("hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default False - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This options is only available in GPU devices. - Default False. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property("fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == boost::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.fuse_broadcast_ops_ = b; - }, - R"DOC(The type is BOOL, fuse_broadcast_op indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster. Because - fusing broadcast OP equals delaying the execution of all - broadcast Ops, in this case, all nccl streams are used only - for NCCLReduce operations for a period of time. 
Default False.)DOC") - .def_property("fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == boost::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, - "BuildStrategy is finlaized."); - self.sync_batch_norm_ = b; - }, - R"DOC(The type is BOOL, sync_batch_norm indicates whether to use - synchronous batch normalization which synchronizes the mean - and variance through multi-devices in training phase. - - Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. - - Default False - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = boost::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW( - "BuildStrategy.memory_optimize must be None, False or True"); - } - }, - R"DOC(The type is BOOL or None, memory opitimize aims to save total memory - consumption, set to True to enable it. - - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. - True means enabling and False means disabling. 
Default None.)DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW("Windows has NO support to distribute mode."); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property("async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == boost::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property("enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def("_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - pe.def(py::init &, - const std::vector &, const std::string &, - Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy. - // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. - .def("local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", [](ParallelExecutor &self, - const std::vector &fetch_tensors) { - pybind11::gil_scoped_release release; - return self.Run(fetch_tensors); - }); - - BindFleetWrapper(&m); - BindBoxHelper(&m); -#ifndef _WIN32 - BindNCCLWrapper(&m); -#endif - BindGraph(&m); - BindNode(&m); - BindInferenceApi(&m); - BindExpandApi(&m); - BindDataset(&m); -#ifdef PADDLE_WITH_DISTRIBUTE - BindCommunicator(&m); -#endif -} -} // namespace pybind -} // namespace paddle