diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 7c4a79967bebf888b5b9ef679d9da70e25e7fc12..2cc40b7bcd8c59feb811d0b9204f323cab100d92 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -53,6 +53,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("fuse_relu_depthwise_conv_pass");
     }
 
+    // NOTE(dzhwinter): notes on automatic inplace:
+    // 1. Passes that modify the program desc should be
+    //    placed before the inplace pass.
+    // 2. Manually configured inplace ops should be set up
+    //    before the inplace pass.
+
     // Add automatically inplace.
     if (strategy_.enable_inplace_) {
       AppendPass("inplace_pass");
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 649b129161bd416bc77549c801dc2524229d1f9b..e3e06a5614ddee0bea342bc3608691b7a32326cc 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -80,6 +80,9 @@ struct BuildStrategy {
 
   bool memory_early_delete_{false};
 
+  // TODO(dzhwinter):
+  // make enable_inplace_, memory_optimize_, and
+  // memory_early_delete_ true by default.
   bool enable_inplace_{false};
 
   bool enable_sequential_execution_{false};
diff --git a/paddle/fluid/framework/details/graph_print_pass.h b/paddle/fluid/framework/details/graph_print_pass.h
index 5ff98609ce2507a4fe0758caa07bfaebe866e4bd..ab506abbabb5c621ab21c71f26724c12f0f0d14f 100644
--- a/paddle/fluid/framework/details/graph_print_pass.h
+++ b/paddle/fluid/framework/details/graph_print_pass.h
@@ -26,6 +26,11 @@ namespace details {
 constexpr char kGraphvizPath[] = "debug_graphviz_path";
 constexpr char kGraphviz[] = "graphviz";
 
+// NOTE(dzhwinter): if the graph contains cycles,
+// it cannot be topologically sorted. This printer
+// prints the whole graph and highlights the cycles,
+// which is quite useful for debugging deadlocks
+// and cycles.
 class GraphvizNode {
  public:
   GraphvizNode(ir::Node* n, const int& i) : node_(n), id_(i) {}
@@ -37,7 +42,7 @@ class GraphvizNode {
   ir::Node* node_;
   int id_;
 };
-class GraphvizNode;
+
 typedef std::unordered_set<std::unique_ptr<GraphvizNode>> GraphvizNodes;
 
 class SSAGraphPrinter {
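Both flags touched above surface on the Python-side BuildStrategy. A minimal usage sketch, assuming the Fluid 1.x API; the `debug_graphviz_path` attribute name is inferred from `kGraphvizPath` above, so treat it as an assumption:

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Run the automatic inplace pass (off by default, per the TODO above).
build_strategy.enable_inplace = True
# Dump the SSA graph so the printer above can highlight cycles (assumed attr name).
build_strategy.debug_graphviz_path = "/tmp/graph"
```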
#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include +#include #include #include @@ -21,15 +23,17 @@ namespace paddle { namespace framework { namespace details { +size_t NodeSizeInBytes(const VarDesc& node) { + auto shape = node.GetShape(); + int size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t type_size = SizeOfType(node.GetDataType()); + return type_size * std::abs(size); +} + size_t NodeSizeInBytes(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - auto shape = desc->GetShape(); - size_t type_size = SizeOfType(desc->GetDataType()); - int size = 1; - for (auto& s : shape) { - size *= s; - } - return type_size * std::abs(size); + return NodeSizeInBytes(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -154,23 +158,28 @@ std::string OrderedNodeList::ToString() const { bool NodeCanReused(ir::Node* node) { if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; - auto* desc = node->Var(); - auto type = desc->GetType(); - if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || - desc->GetShape().empty()) { - return false; - } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node->Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') - return false; + // auto* desc = node->Var(); + bool flag = NodeCanReused(*node->Var()); for (auto* op : node->inputs) { if (op->Op()->HasAttr("force_cpu")) { // op output force generated in cpu, can not be reused. - return framework::AttrReader(op->Op()->GetAttrMap()) - .Get("force_cpu") == 0; + flag &= framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; } } + return flag; +} + +bool NodeCanReused(const VarDesc& node) { + auto type = node.GetType(); + if (node.Persistable() || type != proto::VarType::LOD_TENSOR || + node.GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node.Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; return true; } diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 02f896325204f3de2bb20622fe87a988caf7f0d8..064183d61ea7386b6b45034c90fd7569a8647f60 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -86,12 +86,18 @@ class OrderedNodeList { // valid a tensor can be reuse or not bool NodeCanReused(ir::Node* node); +// valid a tensor can be reuse or not. 
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 02f896325204f3de2bb20622fe87a988caf7f0d8..064183d61ea7386b6b45034c90fd7569a8647f60 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -86,12 +86,18 @@ class OrderedNodeList {
 // valid a tensor can be reuse or not
 bool NodeCanReused(ir::Node* node);
 
+// check whether a tensor can be reused or not
+bool NodeCanReused(const VarDesc& node);
+
 // check op has subblock or not
 bool OpHasSubBlock(OpDesc* desc);
 
 // node memory size in bytes
 size_t NodeSizeInBytes(ir::Node* n);
 
+// node memory size in bytes
+size_t NodeSizeInBytes(const VarDesc&);
+
 std::string DebugString(ir::Node* var);
 
 VarDesc* FindVarDescInBlock(ir::Node* n);
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
index fe28c7ed2e522a1f9d027c916ab02285ead64baa..03ab2a2b6c5dc07805fddddc3ac53f61e7b6a697 100644
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -19,6 +19,7 @@
 #include <numeric>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
 
@@ -66,30 +67,9 @@ class InplaceInToOut : public InplaceOpInference {
                       const OpDesc& op_desc, BlockDesc* block) const = 0;
 
   bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
-    auto var_can_reused = [&](const VarDesc& node) -> bool {
-      auto type = node.GetType();
-      if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
-          node.GetShape().empty()) {
-        return false;
-      }
-      // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
-      std::string name = node.Name();
-      if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
-        return false;
-      return true;
-    };
-
-    auto var_size_in_bytes = [&](const VarDesc& node) -> size_t {
-      auto shape = node.GetShape();
-      int size = std::accumulate(shape.begin(), shape.end(), 1,
-                                 std::multiplies<int>());
-      size_t type_size = SizeOfType(node.GetDataType());
-      return type_size * std::abs(size);
-    };
-
-    return in.Name() != out.Name() && var_can_reused(in) &&
-           var_can_reused(out) &&
-           var_size_in_bytes(out) <= var_size_in_bytes(in);
+    return in.Name() != out.Name() && details::NodeCanReused(in) &&
+           details::NodeCanReused(out) &&
+           details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
   }
 };
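`TryInplaceInputOutput` now delegates to the shared `details::` helpers instead of duplicating them as lambdas. A Python sketch of the inplace test (with hypothetical `node_can_reused` / `node_size_in_bytes` mirrors of the C++ helpers):

```python
def try_inplace(in_var, out_var):
    # Distinct names, both reusable LoD tensors, and the output must fit
    # within the memory of the input it would overwrite.
    return (in_var.name != out_var.name
            and node_can_reused(in_var)
            and node_can_reused(out_var)
            and node_size_in_bytes(out_var) <= node_size_in_bytes(in_var))
```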
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index a35a4c59835e2a64a11ae156bed34d4b35696f73..ef0242942838fcca737a10fafbafa61bf520b532 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -174,6 +174,11 @@ class CompiledProgram(object):
             self._exec_strategy.num_threads = cpu_num * 2
 
         trainers_endpoints = self._program._trainers_endpoints
+
+        # FIXME(dzhwinter): enable_inplace should run after memory_optimize;
+        # if the Python memory optimization is turned on, turn off the inplace pass.
+        self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+
         if self._build_strategy.num_trainers > 1 and trainers_endpoints:
             assert self._build_strategy.num_trainers == len(
                 trainers_endpoints), "num_trainers == len(end_points)"
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 45f5f6ea87e406da07e04194f76ecde84db7bd75..c0b0ad8a202b82183de9ec1edd43cb10db10fb5c 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1725,18 +1725,19 @@ class Program(object):
         self._trainers_endpoints = []
         # the distributed lookup table names
         self._distributed_lookup_table = None
+        # @deprecated(the Python memory_optimize transpiler is deprecated)
         # whether the program is optimized by memory_optimize_transpiler
-        self.__is_optimized = False
+        self.__is_mem_optimized = False
 
     @property
-    def _is_optimized(self):
+    def _is_mem_optimized(self):
         # if the program is optimized, operator input/outputs
         # maybe same, which conflict with save_inference_model.
-        return self.__is_optimized
+        return self.__is_mem_optimized
 
-    @_is_optimized.setter
-    def _is_optimized(self, target):
-        self.__is_optimized = target
+    @_is_mem_optimized.setter
+    def _is_mem_optimized(self, target):
+        self.__is_mem_optimized = target
 
     @property
     def op_role(self):
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 3ae7fddaace8555a8d2998d69698e4d2038494eb..9d027ce901b91b31169de3b5468cff8ac9466849 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -931,7 +931,7 @@ def save_inference_model(dirname,
     if main_program is None:
         main_program = default_main_program()
 
-    if main_program._is_optimized:
+    if main_program._is_mem_optimized:
         warnings.warn(
             "save_inference_model must put before you call memory_optimize. \
             the memory_optimize will modify the original program, \
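The io.py guard exists because the transpiler mutates the program in place; `_is_mem_optimized` only lets it warn after the fact. A usage sketch (Fluid 1.x API; `exe`, `program`, and `avg_cost` are assumed to be set up as in the unit test below):

```python
import paddle.fluid as fluid

# Correct order: export the inference model first, then optimize memory.
fluid.io.save_inference_model("./infer_model", ["x", "y"], [avg_cost], exe, program)
fluid.memory_optimize(program)  # sets program._is_mem_optimized = True

# Reversed order still runs, but emits the warning shown in io.py above.
```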
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index da18b4e51f206a1983bf368d0044a6442ab2fbac..52b260efd15066a114a8146106685043654c91ea 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -148,7 +148,7 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
-        build_strategy.enable_inplace = False if main._is_optimized else True
+        build_strategy.enable_inplace = False if main._is_mem_optimized else True
         scope = scope if scope is not None else executor.global_scope()
 
         if share_vars_from and not isinstance(share_vars_from,
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index d260afcd623610f0fb27a6c62f831e3d38798ae1..def73d7072c8d0c95f5196f4ecf90f2174234ba7 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -108,7 +108,7 @@ class TestSaveInferenceModel(unittest.TestCase):
         exe.run(init_program, feed={}, fetch_list=[])
 
         memory_optimize(program, print_log=True)
-        self.assertEqual(program._is_optimized, True)
+        self.assertEqual(program._is_mem_optimized, True)
         # will print warning message
 
         save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index fc8dafbe976533fb8b3601a5a937074ed084a66f..52c1aea288fa2bb7478ad14186367900c05f64e7 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -540,7 +540,7 @@ def memory_optimize(input_program,
     if skip_opt_set is not None:
         skip_opt_set = set(map(to_name_str, skip_opt_set))
     cfgs = _get_cfgs(input_program)
-    input_program._is_optimized = True
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
@@ -560,6 +560,6 @@ def release_memory(input_program, skip_opt_set=None):
         None
     """
     cfgs = _get_cfgs(input_program)
-    input_program._is_optimized = True
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.release_memory(skip_opt_set=skip_opt_set)
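The ternary in parallel_executor.py (and its twin in compiler.py) reads more naturally in plain-if form; a sketch for illustration only, not part of the patch:

```python
if main._is_mem_optimized:
    # The Python memory_optimize transpiler already rewrote the program;
    # running the C++ inplace pass on top of it is unsafe.
    build_strategy.enable_inplace = False
else:
    # No Python-side optimization ran, so the inplace pass is safe to enable.
    build_strategy.enable_inplace = True
```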