From 8f3bf905f9d19dd7253e1fbf94d6351d4ddeda86 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 18 Feb 2019 23:35:08 +0800 Subject: [PATCH] cherry-picked changes. test=release/1.3 --- cmake/flags.cmake | 3 +- paddle/fluid/framework/details/CMakeLists.txt | 15 +- .../fluid/framework/details/build_strategy.cc | 4 +- .../framework/details/inplace_op_pass.cc | 35 +- .../fluid/framework/details/inplace_op_pass.h | 15 +- .../details/memory_optimize_helper.cc | 392 +++++++++++++-- .../details/memory_optimize_helper.h | 119 +++-- .../details/memory_optimize_helper_test.cc | 463 +++++++++++++++++- .../framework/details/memory_optimize_pass.cc | 425 +++------------- .../framework/details/memory_optimize_pass.h | 57 +-- .../details/memory_optimize_pass_test.cc | 417 ---------------- .../details/sequential_execution_pass.cc | 1 + .../details/sequential_execution_pass.h | 2 - paddle/fluid/framework/inplace_op_inference.h | 2 +- .../framework/inplace_op_inference_test.cc | 33 +- paddle/fluid/framework/parallel_executor.cc | 11 +- .../operators/elementwise/elementwise_op.h | 20 +- python/paddle/fluid/compiler.py | 5 +- python/paddle/fluid/parallel_executor.py | 5 +- .../fluid/tests/unittests/CMakeLists.txt | 9 + .../unittests/parallel_executor_test_base.py | 2 +- .../test_fuse_elewise_add_act_pass.py | 4 + .../test_ir_memory_optimize_transformer.py | 48 ++ 23 files changed, 1135 insertions(+), 952 deletions(-) delete mode 100644 paddle/fluid/framework/details/memory_optimize_pass_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 81e7868a6..36b533aa4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") endif() - endif() + endif() endif() endfunction() @@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif(NOT WIN32) if (APPLE) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 6621a59d3..dc308fd25 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +if(WITH_GPU) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) +else() +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) +endif() + cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) -cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass) - +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 51ce97327..0c823b9ca 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -206,8 +206,6 @@ std::unique_ptr BuildStrategy::Apply( new std::vector(main_program.Block(0).AllOps()); graph->Set>(kAllOpDescs, all_op_descs); // take ownership - graph->Set(kGraphNodePool, - new GraphNodePool); // take ownership pass->Erase(kAllOpDescs); pass->SetNotOwned>(kAllOpDescs, all_op_descs); @@ -242,7 +240,9 @@ std::unique_ptr BuildStrategy::Apply( continue; } } + VLOG(3) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(std::move(graph)); + VLOG(3) << "Finish Apply Pass " << pass->Type(); } return graph; } diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 64368a5e8..c91fc81b2 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -49,7 +49,7 @@ DEFINE_bool( "If this option turns on, only these op in whitelist can be inplaced." "If it turns off, all of the running op can be candidate of inplaced op." "Such as scale, elementwise_add" - "By default, it's turned on"); + "By default, it's turned off"); DECLARE_string(memory_optimize_debug); @@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var, } } -const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const { +const NodeSwapQueue InplacePass::TryInplaceModifyVar( + const std::string& var, const std::string& cache_var, const size_t& idx, + ir::Graph* graph) const { PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && var_nodes_[var].at(0)->Var() != nullptr); std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); var_desc->SetName(cache_var); - SSANodePair swap_nodes; + NodeSwapQueue swap_nodes; for (size_t i = idx; i < view_.AllOps().size(); ++i) { auto* op = view_.AllOps()[i]; @@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, return swap_nodes; } -void InplacePass::CommitModify(const SSANodePair& swap_nodes, +void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes, ir::Graph* graph) const { for (auto& pair : swap_nodes) { auto *node = pair.first, *cache_node = pair.second; @@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes, } } -void InplacePass::WithdrawModify(const SSANodePair& nodes, +void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const { for (auto& pair : nodes) { auto *node = pair.first, *cache_node = pair.second; @@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) { // 2. track the nodes which used by parameter server. // these node can not be inplaced, otherwise trainer // pserver can not find each other name. - for (auto& node : g->Nodes()) { - if (!node->IsOp()) continue; - if (node->Name() == "send") { - for (auto& in : node->inputs) { - dup_nodes_.emplace(in->Name()); - } + auto update_skip_set = [&](ir::Node* node) { + for (auto& in : node->inputs) { + if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name()); } - if (node->Name() == "recv") { - for (auto& out : node->outputs) { + for (auto& out : node->outputs) { + if (out->IsVar() && out->Var() != nullptr) dup_nodes_.emplace(out->Name()); - } } + }; + for (auto& node : g->Nodes()) { + if (!node->IsOp()) continue; + if (node->Name() == "send") update_skip_set(node); + if (node->Name() == "recv") update_skip_set(node); + if (node->Name() == "prefetch") update_skip_set(node); } } diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 1abcf1f27..7be7f3118 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -56,7 +56,8 @@ class GraphView { std::map> adj_list_; }; -typedef std::vector> SSANodePair; +// swap pairs in sequence +typedef std::vector> NodeSwapQueue; class InplacePass : public ir::Pass { public: InplacePass(); @@ -68,14 +69,14 @@ class InplacePass : public ir::Pass { void InitSSAGraphNodes() const; private: - const SSANodePair TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const; + const NodeSwapQueue TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; - void CommitModify(const SSANodePair&, ir::Graph* graph) const; + void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const; - void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; + void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const; void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, const size_t& idx) const; diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index b56ef021e..6126c168c 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -13,17 +13,114 @@ // limitations under the License. #include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include +#include #include -#include +#include #include #include #include +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif // PADDLE_WITH_CUDA namespace paddle { namespace framework { namespace details { +using paddle::framework::VarDesc; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} -size_t NodeSizeInBytes(const VarDesc& node) { +size_t NodeSize(const VarDesc& node) { auto shape = node.GetShape(); int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); @@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) { return type_size * std::abs(size); } -size_t NodeSizeInBytes(ir::Node* n) { +size_t NodeSize(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - return NodeSizeInBytes(*desc); + return NodeSize(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) { std::string DebugString(ir::Node* var) { return DebugStringImpl(FindVarDescInBlock(var)); } -// return DebugString(var->Var()); } // NOTE(dzh): based ir node, if a large node has been reused // by a small size node, then next time it appear in pool, it will @@ -76,22 +172,26 @@ struct NodeComparator { bool operator()(ir::Node* lhs, ir::Node* rhs) const { auto* lhs_desc = FindVarDescInBlock(lhs); auto* rhs_desc = FindVarDescInBlock(rhs); + // match data type + if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { + return false; + } + // match shape auto lhs_shape = lhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape(); if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + return NodeSize(lhs) <= NodeSize(rhs); } else { return false; } } }; -void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { +void OrderedSet::Insert(ir::Node* var) { PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - PADDLE_ENFORCE(op->IsOp()); if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->second.insert(op); + mark_table_[var->Name()]->emplace_back(var); return; } @@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { auto var_shape = var_desc->GetShape(); int batch_size = static_cast(var_shape[0]); - NodeComparator compare_node; + NodeComparator functor; Iter it = nodes_.begin(); while (it != nodes_.end()) { - auto* cache_desc = FindVarDescInBlock(it->first); + auto& prev = it->front(); + auto* cache_desc = FindVarDescInBlock(prev); int cache_batch_size = cache_desc->GetShape()[0]; if ((cache_batch_size == -1 && batch_size == -1) || (cache_batch_size != -1 && batch_size != -1)) { - if (compare_node(it->first, var)) { + if (functor(prev, var)) { ++it; } else { break; @@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { } } - it = - nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + it = nodes_.insert(it, {var}); mark_table_[var->Name()] = it; } -int OrderedNodeList::GetIndex(ir::Node* var) { +int OrderedSet::GetNodeIndexInPool(ir::Node* var) { return std::distance(nodes_.begin(), mark_table_[var->Name()]); } -ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { +ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { ir::Node* found_node = nullptr; - NodeComparator compare_node; + NodeComparator functor; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - if (compare_node(var, it->first)) { - found_node = it->first; + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; break; } } return found_node; } -void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } +ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + auto it = + std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { + if (v.front() == prev) + return true; + else + return false; + }); + PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); + for (it = std::next(it); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + +bool OrderedSet::Has(ir::Node* var) const { + if (mark_table_.count(var->Name())) { + auto& node_in_samename = mark_table_.at(var->Name()); + auto iter = + std::find_if(node_in_samename->begin(), node_in_samename->end(), + [&](ir::Node* n) { return n->Name() == var->Name(); }); + return iter != node_in_samename->end(); + } + return false; +} -void OrderedNodeList::Erase(const std::string& var) { +void OrderedSet::Erase(const std::string& var) { PADDLE_ENFORCE(mark_table_.count(var)); nodes_.erase(mark_table_[var]); mark_table_.erase(var); } -std::string OrderedNodeList::ToString() const { +void OrderedSet::Erase(ir::Node* var) { + PADDLE_ENFORCE(var != nullptr); + Erase(var->Name()); +} + +std::string OrderedSet::ToString() const { std::stringstream ss; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - ss << DebugString(it->first) << " "; + for (auto& node : *it) { + ss << DebugString(node) << " "; + } } return ss.str(); } bool NodeCanReused(ir::Node* node) { + // valid the node is a var node if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; - // auto* desc = node->Var(); - bool flag = NodeCanReused(*node->Var()); + + bool flag = true; + // op output force generated in cpu, can not be reused. for (auto* op : node->inputs) { if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused. flag &= framework::AttrReader(op->Op()->GetAttrMap()) .Get("force_cpu") == 0; } } + // var desc validation. + flag &= NodeCanReused(*node->Var()); return flag; } +int MinChunkSize() { + int size{0}; +#ifdef PADDLE_WITH_CUDA + size = platform::GpuMinChunkSize(); +#else + size = platform::CpuMinChunkSize(); +#endif // PADDLE_WITH_CUDA + return size; +} + bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); - if (node.Persistable() || type != proto::VarType::LOD_TENSOR || - node.GetShape().empty()) { + // only these types holds bulk of gpu memory + if (!(type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY)) { + return false; + } + // persistable variable is parameter + if (node.Persistable()) { + return false; + } + // shape < min_chunk_size is meaningless. + // further more, fetched loss always has size = 1 + // which should not be reused. + auto shape = node.GetShape(); + int size = std::abs( + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); + if (shape.empty() || size < MinChunkSize()) { return false; } // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad @@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) { return false; } +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + PADDLE_ENFORCE((output != nullptr && output->IsVar()), + "Output is empty!"); + if (output->Var() && output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 064183d61..dba96309f 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,41 +29,43 @@ namespace paddle { namespace framework { namespace details { -constexpr char kFetchedVars[] = "fetched_vars"; -constexpr char kGraphNodePool[] = "graph_node_pool"; +constexpr char kAllOpDescs[] = "all_op_descs"; -// NOTE(dzh): Variable and the operators use the var. -// for early delete pass. -// Because analysis var pass build base on ir::Node, which maybe released -// or modified between passes, so we use OpDesc* to mark ops. -using GraphNodePool = std::vector< - std::pair /* ops */>>; +std::vector SortOpLikeDescOrder(const ir::Graph& graph); -// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size is determined in runtime. -// the node batch_size equal -1 always ranking in the front than the node not. +// NOTE(dzh): A ordered set for node reuse in memory optimize. +// the orderedset sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size, which is determined in runtime. +// So the reuse happens between nodes who's batch_size both are -1 +// simultaneously or not. +// +// sort rule: +// rule 0 : smaller node ranking in front. +// rule 1 : batch_size equal -1 ranking in the front than the node not. +// // For example, // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. -// O(1) insert, delete -class OrderedNodeList { - public: - using NodePair = std::pair>; - using Iter = typename std::list::iterator; - using ConstIter = typename std::list::const_iterator; - void Insert(ir::Node* var, ir::Node* op); +class OrderedSet { + public: + // nodes with same name exists in pool. + using NodeVector = std::vector; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + void Insert(ir::Node* var); void Erase(ir::Node* var); - void Erase(const std::string& var); - - bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } - - bool Has(const std::string& var) { return mark_table_.count(var); } - - ir::Node* NodeMatch(ir::Node* var) const; + bool Has(ir::Node* var) const; + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + // find the bestfit shape node block with var. + ir::Node* FindBestFitNode(ir::Node* var) const; + ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; // map store non-const iterator, can not promise const - int GetIndex(ir::Node* var); + int GetNodeIndexInPool(ir::Node* var); // pool all node to string std::string ToString() const; @@ -69,18 +73,54 @@ class OrderedNodeList { Iter end() { return nodes_.end(); } ConstIter begin() const { return nodes_.begin(); } ConstIter end() const { return nodes_.end(); } - size_t size() const { return nodes_.size(); } - void Clear() { - mark_table_.clear(); - nodes_.clear(); - } + size_t size() const { return nodes_.size(); } private: // for searching. std::unordered_map mark_table_; - // node swap pairs. var -> ops dep var - std::list nodes_; + // node pool + std::list nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // IR Graph + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort }; // valid a tensor can be reuse or not @@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node); bool OpHasSubBlock(OpDesc* desc); // node memory size in bytes -size_t NodeSizeInBytes(ir::Node* n); +size_t NodeSize(ir::Node* n); // node memory size in bytes -size_t NodeSizeInBytes(const VarDesc&); +size_t NodeSize(const VarDesc&); std::string DebugString(ir::Node* var); +// NOTE(dzhwinter) +// after node reuse, the replaced node shape is +// different with its VarDesc. So need to find the +// correct VarDesc in Block. VarDesc* FindVarDescInBlock(ir::Node* n); +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + template class FilterVariableImpl { public: diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index f2b9baf14..3cfe297a7 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include #include +#include #include #include #include @@ -22,13 +23,19 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { namespace details { -TEST(OrderedNodeList, Normal) { - OrderedNodeList pool; +TEST(OrderedSet, Normal) { + OrderedSet pool; std::vector> nodes; // clang-format off @@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) { nodes.emplace_back(std::move(node)); } + // Insert for (auto& node : nodes) { - pool.Insert(node.get(), op.get()); + pool.Insert(node.get()); + } + + // Has/size + ASSERT_EQ(pool.size(), shapes.size()); + for (auto& node : nodes) { + ASSERT_TRUE(pool.Has(node.get())); } // assert its order and interface. @@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) { std::cout << pool.ToString() << std::endl; ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); - ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); { auto v1 = block_desc->Var("11"); v1->SetShape({-1, 256, 56, 56}); std::unique_ptr node1 = ir::CreateNodeForTest(v1); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); + auto* cache = pool.FindBestFitNode(node1.get()); ASSERT_EQ(cache, nullptr); } { @@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) { v2->SetShape({-1, 2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v2); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] } { auto v3 = block_desc->Var("13"); v3->SetShape({2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v3); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] + } +} + +TEST(OrderedSet, FindBestFitNode) { + OrderedSet pool; + std::vector> nodes; + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + { + auto desc = block_desc->Var("a"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("b"); + desc->SetShape({128, 129}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("c"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get()); + } + + // FindNextBestFitNode + auto* n = nodes[0].get(); + auto* cache = pool.FindBestFitNode(n); + PADDLE_ENFORCE(cache->Name() == "a"); + cache = pool.FindNextBestFitNode(n, cache); + PADDLE_ENFORCE(cache->Name() == "c"); + cache = pool.FindNextBestFitNode(n, cache); + PADDLE_ENFORCE(cache->Name() == "b"); +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); } } diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 33ca45668..b35b967c7 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -43,16 +43,10 @@ namespace paddle { namespace framework { namespace details { -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - std::unique_ptr MemoryOptimizePass::ApplyImpl( std::unique_ptr graph) const { auto nodes = graph->Nodes(); - auto subblock_vars = GetSubBlockVars(nodes); - skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + CollectSkipVarsSet(nodes); cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_->LiveVariableAnalysis(); @@ -75,82 +69,67 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } for (auto& var : op->outputs) { - if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || - skip_set_.count(var->Name())) + if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) { + VLOG(3) << "Skip set contains variable of " << var->Name() + << "disable reuse on it. skipped"; continue; - ir::Node* cache = pool_.NodeMatch(var); - - if (var->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "start match var " << DebugString(var) << " of op " - << op->Name(); - VLOG(3) << pool_.ToString(); - VLOG(3) << "matched in pool : " - << ((cache == nullptr) ? "False" : "True"); } + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.FindBestFitNode(var); + while (cache != nullptr && var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused. " + << cache->Name() << " is re-filled to the pool after " + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + cache = pool_.FindNextBestFitNode(var, cache); + } + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } - if (cache == nullptr) continue; - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." << var->Name() - << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - continue; - - int node_idx_in_pool = pool_.GetIndex(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - // update CFG Graph on the fly. - // reused var maybe re-fill into the pool - cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); - // NOTE(dzhwinter): we need to both update the ProgramDesc - // and IR Graph. because op_desc/var_desc is used in CreateOp, - // CreateVar when running happens. But IR Graph - // define the dependence relationship between nodes. - RenameVarInGraphDesc(var->Name(), cache->Name(), idx); - RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); - - pool_.Erase(cache); - } - // fill the pool - std::unordered_set unlived_vars; - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { - unlived_vars.emplace(var); + if (cache != nullptr) { + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // NOTE(dzhwinter): update the ProgramDesc/IR Graph + // and the CFG Graph on the fly. + // + // IR Graph define the dependence relationship between nodes. + // + // ProgramDesc defines the input/output vars. Its used in + // CreateOp, CreateVar when running happens. + // + // CFG Graph store the liveness information, when reuse happens + // we also need to update the variable liveness. + const std::string var_name = var->Name(); + const std::string cache_name = cache->Name(); + + cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); + RenameVarInGraphDesc(var_name, cache_name, idx); + RenameVarInGraphNode(var_name, cache_name, idx, graph.get()); + pool_.Erase(cache_name); } } - for (auto var : unlived_vars) { - ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeByName(var, op); + if (var_node == nullptr || var_node->IsCtrlVar()) continue; if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node, op); + pool_.Insert(var_node); } } } } graph->ResolveHazard(var_nodes_); - // For early delete pass. use GraphNodePool load the unlived vars. - // 1. find all deps op for each unlived var in memory pool. - for (auto& op : graph->Nodes()) { - for (auto& var : op->inputs) { - if (pool_.Has(var)) { - pool_.Insert(var, op); - } - } - } - // 2. convert ir node based memory pool to graph node - // because Node* maybe released bettwen passes. - auto& graph_pool = graph->Get(kGraphNodePool); - for (auto it = pool_.begin(); it != pool_.end(); ++it) { - std::unordered_set descs; - for (auto& op : it->second) { - PADDLE_ENFORCE(op->IsOp()); - descs.insert(op->Op()); - } - graph_pool.push_back(std::make_pair(it->first->Name(), descs)); - } - return graph; } @@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { PADDLE_ENFORCE(sub_op != nullptr); for (auto* var : sub_op->outputs) { if (NodeCanReused(var)) { - ir::Node* cache = pool_.NodeMatch(var); + ir::Node* cache = pool_.FindBestFitNode(var); if (cache != nullptr) { if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { continue; } - int node_idx_in_pool = pool_.GetIndex(cache); + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d", std::to_string(sub_reuse_id++), DebugString(var), @@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { } } -std::unordered_set MemoryOptimizePass::GetSubBlockVars( +void MemoryOptimizePass::CollectSkipVarsSet( const std::unordered_set& nodes) const { - std::unordered_set vars; + auto update_skip_set = [&](OpDesc* op_desc) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + skip_set_.insert(inputs.begin(), inputs.end()); + skip_set_.insert(outputs.begin(), outputs.end()); + }; for (auto& op : nodes) { if (!op->IsOp() || op->Op() == nullptr) continue; auto* op_desc = op->Op(); - if (OpHasSubBlock(op_desc)) { - auto inputs = op_desc->InputArgumentNames(); - auto outputs = op_desc->OutputArgumentNames(); - vars.insert(inputs.begin(), inputs.end()); - vars.insert(outputs.begin(), outputs.end()); - } + // NOTE(dzhwinter): + // current block can not reuse next level block vars. + if (OpHasSubBlock(op_desc)) update_skip_set(op_desc); + // NOTE(dzhwinter): + // distributed ops input/output name need to + // keep same bettwen trainer/pserver + if (op_desc->Type() == "send") update_skip_set(op_desc); + if (op_desc->Type() == "recv") update_skip_set(op_desc); + if (op_desc->Type() == "prefetch") update_skip_set(op_desc); } - return vars; } void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, @@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, // redirect the input to the latest version of cache_var for (auto* node : op->inputs) { if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); + ir::Node* cache_node = var_nodes_[cache_var].back(); // swap node to cache_node cache_node->outputs.insert(cache_node->outputs.end(), @@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, auto* prev_op = node->inputs[0]; std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, cache_node); - cache_node->inputs.emplace_back(prev_op); for (auto* next_op : node->outputs) { std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, cache_node); } + + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); } } @@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, cache_node); } - } - } - } - - // release node of unused var in graph - for (auto* node : var_nodes_[var]) { - graph->RemoveNode(node); - } - var_nodes_.at(var).clear(); -} - -std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); - // 1. get op desc order - auto& op_descs = graph.Get>(kAllOpDescs); - - // 2. topology sort order - auto nodes = graph.Nodes(); - std::deque ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map op_deps; - std::list ready_ops; - std::unordered_map> pending_ops; - - for (auto* op : ops) { - std::unordered_set preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector ret; - std::list op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. - std::list prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair& p) { return p.second == 0; })); - - return ret; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. - for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) { - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. - auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. - // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - } - } -} - -const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { - auto it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::vector ControlFlowGraph::Ops() const { return ops_; } - -std::vector& ControlFlowGraph::Ops() { return ops_; } -ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. - ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - if (output->Name() == name) { - found_node = output; + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); } } } - return found_node; } } // namespace details @@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, REGISTER_PASS(memory_optimize_pass, paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index b3e026e0b..593ffc10f 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -32,20 +32,15 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph); - -class ControlFlowGraph; class MemoryOptimizePass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - - private: // fill the variable map(var_nodes) by version. void InitSSAGraphNodes() const; + + private: // update program descs void RenameVarInGraphDesc(const std::string& var, const std::string& cache_var, size_t idx) const; @@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass { ir::Graph* graph) const; void SubGraphOptimize(OpDesc* op_desc) const; - // scan subblock and collect the output/input variables. - std::unordered_set GetSubBlockVars( - const std::unordered_set&) const; + // 1. scan op with subblock and collect the output/input vars. + // while, while_grad, conditional_block + // 2. scan distributed ops and collect the output/input vars + void CollectSkipVarsSet(const std::unordered_set&) const; private: // Reuse Node Pool, Owned. - mutable OrderedNodeList pool_; + mutable OrderedSet pool_; // controlflow Graph mutable std::unique_ptr cfg_; // skip set @@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass { mutable std::map> var_nodes_; }; -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // For IR Graph in parallelexecutor - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set LiveIn(ir::Node* op) const; - const std::set LiveOut(ir::Node* op) const; - const std::set Use(ir::Node* op) const; - const std::vector Ops() const; - std::vector& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - using NodeListMap = std::unordered_map>; - using VarSetMap = std::map>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - - std::vector ops_; // op sequence by topology sort -}; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc deleted file mode 100644 index 3d3dfa935..000000000 --- a/paddle/fluid/framework/details/memory_optimize_pass_test.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_optimize_pass.h" -#include -#include -#include -#include "glog/logging.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/graph_test_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -REGISTER_OPERATOR(sum, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(assign, paddle::framework::DummyOp, - paddle::framework::AssignOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -/* - https://en.wikipedia.org/wiki/Live_variable_analysis - Create a customed classical dependency graph, left row is the instruction - number. - 1. a = 1 - 2. b = a - 3. c = a - 4. d = b + c - 5. e = d - - a--------+ - | | - b c - | | - d--------+ - | - e - Then analysis these variable's liveness range - */ - -namespace paddle { -namespace framework { -namespace details { - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -inline static ProgramDesc FillProgramDesc() { - ProgramDesc prog; - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"c"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"d"}); - op->SetOutput("Out", {"e"}); - } - return prog; -} - -TEST(CFGGraph, IRGraph) { - // prepare ir graph - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - ControlFlowGraph cfg(graph); - cfg.LiveVariableAnalysis(); - - // test assign op - ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); - - // test assign op - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); - - // test sum op - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); - ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); - - // test assign op - ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); - ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); -} - -// 1. normal test -TEST(SortOpLikeDescOrder, NormalTest) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto nodes = SortOpLikeDescOrder(graph); - auto op_descs = prog.Block(0).AllOps(); - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 2. remove some op_desc -TEST(SortOpLikeDescOrder, RemoveOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - auto nodes = graph.Nodes(); - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->outputs.back()->Name() == "e") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - graph.RemoveNode(found_node); - graph.RemoveNode(e); - - // other node keeps the same order - auto remain_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < remain_nodes.size(); ++i) { - auto node = remain_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 3. add some op_desc -TEST(SortOpLikeDescOrder, AddOpDesc) { - auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // cached desc different with real one - // mimic the intermidiete pass modify the programdesc. - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); - - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - op_descs.insert(op_descs.begin() + 4, op); - - auto nodes = SortOpLikeDescOrder(graph); - - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 4. add and delete some op_desc -TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // remove sum node - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - auto nodes = graph.Nodes(); - for (auto node : nodes) { - if (node->Name() == "sum") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* c = find_node_in_graph("c"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(c->outputs.begin(), c->outputs.end(), found_node); - ir::Node* pending_op = found_node->outputs[0]->outputs[0]; - graph.RemoveNode(e); - graph.RemoveNode(pending_op); - graph.RemoveNode(found_node); - } - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - op_descs.insert(op_descs.begin() + 2, op); - - // check the order - auto mynodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < mynodes.size(); ++i) { - auto node = mynodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 5. add and replace some op_desc inplace. -TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - auto op_descs = prog.Block(0).AllOps(); - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - - op_descs.emplace_back(op); - - // replace op_desc inplace - auto nodes = graph.Nodes(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->Op() && node->Name() == "assign") { - if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { - found_node = node; - break; - } - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(e->inputs.begin(), e->inputs.end(), found_node); - graph.RemoveNode(found_node); - } - op_descs.erase(op_descs.begin() + 3); - - auto replace_op = prog.MutableBlock(0)->AppendOp(); - replace_op->SetType("sum"); - replace_op->SetInput("X", {"d", "d1"}); - replace_op->SetOutput("Out", {"e"}); - { - ir::Node* sum2 = graph.CreateOpNode(replace_op); - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - ir::Node* d1 = find_node_in_graph("d1"); - sum2->inputs.emplace_back(d); - sum2->inputs.emplace_back(d1); - sum2->outputs.emplace_back(e); - e->inputs.emplace_back(sum2); - d->outputs.emplace_back(sum2); - d1->outputs.emplace_back(sum2); - } - - op_descs.emplace_back(replace_op); - // compare op order - auto graph_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < graph_nodes.size(); ++i) { - auto node = graph_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index cc2c8bfef..879fb29d5 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index a04c08bc2..ea3034877 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -21,8 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - class SequentialExecutionPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index 03ab2a2b6..a3ccf677c 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference { bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { return in.Name() != out.Name() && details::NodeCanReused(in) && details::NodeCanReused(out) && - details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); + details::NodeSize(out) <= details::NodeSize(in); } }; diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc index 121f648a5..bf9d1dcd3 100644 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) { op->SetOutput("Out", {"test2_out"}); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_out"); - prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) { op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_out"); - prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) { prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) { prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 3ul); std::unordered_map expects = { {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f61c9e3a9..ff7ef0cce 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,14 +171,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - if (build_strategy_.memory_early_delete_) { - auto early_delete_pass = - ir::PassRegistry::Instance().Get("memory_early_delete_pass"); - early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = early_delete_pass->Apply(std::move(graph)); - } - VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); + VLOG(10) << "Eager Deletion Threshold " + << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { for (size_t i = 0; i < graphs.size(); ++i) { graphs[i] = member_->PrepareGCAndRefCnts( @@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle -USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index d04bb8f33..91e441526 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { } }; +class ElementwiseGradOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map ret; + if (block->HasVar(framework::GradVarName("X")) && + block->HasVar(framework::GradVarName("Out"))) { + ret[framework::GradVarName("Out")] = framework::GradVarName("X"); + } + return ret; + } +}; + } // namespace operators } // namespace paddle @@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { op_type##GradMaker, \ ::paddle::operators::ElementwiseOpInplace); \ REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad) + ::paddle::operators::ElementwiseOpExplicitGrad, \ + ::paddle::operators::ElementwiseGradOpInplace) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index ef0242942..c9885b43b 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -177,7 +177,10 @@ class CompiledProgram(object): # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. - self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True + if self._build_strategy.memory_optimize is True: + self._build_strategy.memory_optimize = False if main._is_mem_optimized else True + if self._build_strategy.enable_inplace is True: + self._build_strategy.enable_inplace = False if main._is_mem_optimized else True if self._build_strategy.num_trainers > 1 and trainers_endpoints: assert self._build_strategy.num_trainers == len( diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 52b260efd..29b7f6aba 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,10 @@ class ParallelExecutor(object): else framework.default_main_program() # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. - build_strategy.enable_inplace = False if main._is_mem_optimized else True + if build_strategy.memory_optimize is True: + build_strategy.memory_optimize = False if main._is_mem_optimized else True + if build_strategy.enable_inplace is True: + build_strategy.enable_inplace = False if main._is_mem_optimized else True scope = scope if scope is not None else executor.global_scope() if share_vars_from and not isinstance(share_vars_from, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 699181d01..289a48aac 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) +list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) +if(NOT WIN32) +py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) +endif() if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + # change the timeout from 600 to 1200, because in debug mode, this test need more time. + set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200) + endif() endif() + if (WITH_NGRAPH) add_subdirectory(ngraph) endif() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c429c8af7..a94487e67 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase): if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv - build_strategy.memory_optimize = use_ir_memory_optimize + build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize # python memory optimization is conflict with inplace pass. # Use ir graph memory optimization after inplace pass is the correct way. build_strategy.enable_inplace = False if memory_opt else enable_inplace diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py index 03471a443..c1fb53ecf 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py @@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase): regularization=fluid.regularizer.L2Decay(1e-6)) return optimizer + # NOTE(dzh): + # need to make it compatible with elewise fuse act not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, fuse_elewise_add_act_ops=False, memory_opt=False, + use_ir_memory_optimize=False, optimizer=_optimizer) fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( model, @@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, fuse_elewise_add_act_ops=True, memory_opt=False, + use_ir_memory_optimize=False, optimizer=_optimizer) for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py new file mode 100644 index 000000000..c0f480e34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core + +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio' + +from test_parallel_executor_transformer import TestTransformer +from test_parallel_executor_transformer import transformer + + +# NOTE(dzhwinter): test diferent strategy colisions. +# open the eager delete tensor strategy by default. +class TestTransformerWithIR(TestTransformer): + def test_main(self): + if core.is_compiled_with_cuda(): + # check python transpiler + self.check_network_convergence( + transformer, + use_cuda=True, + memory_opt=True, + use_ir_memory_optimize=False) + # check IR memory optimize + self.check_network_convergence( + transformer, + use_cuda=True, + memory_opt=False, + use_ir_memory_optimize=True) + + +if __name__ == '__main__': + unittest.main() -- GitLab