diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 81e7868a6ad3fee16911a49ff9d1394a103706c5..5895657ece63471c72d3e7eb829cf44702d3f8c7 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,12 +21,13 @@ function(CheckCompilerCXX11Flag) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") endif() - endif() + endif() endif() endfunction() CheckCompilerCXX11Flag() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 6621a59d37a670f7025507faeab5b9897794a72e..e88084424baf7eb5cd1d67ea19966866d71ec3eb 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) -cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass) - +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 51ce9732722efa44d2489f5b77694094e58c8775..f8030c53f72bc8a6f007c1eb6a3072abd8037de2 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -206,8 +206,6 @@ std::unique_ptr BuildStrategy::Apply( new std::vector(main_program.Block(0).AllOps()); graph->Set>(kAllOpDescs, all_op_descs); // take ownership - graph->Set(kGraphNodePool, - new GraphNodePool); // take ownership pass->Erase(kAllOpDescs); pass->SetNotOwned>(kAllOpDescs, all_op_descs); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e3e06a5614ddee0bea342bc3608691b7a32326cc..e62e3edcef710df739c53b5d848f5aceb4f2db4e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -77,9 +77,6 @@ struct BuildStrategy { bool fuse_relu_depthwise_conv_{false}; bool memory_optimize_{false}; - - bool memory_early_delete_{false}; - // TODO(dzhwinter): // make enable_inplace, memory_optimize_ // memory_early_delete_ true by default diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 78c5d5b50e606daa963e728355dc1bce83cd5484..b0c5968499be3a959dd6103424f25056a6dc2282 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var, } } -const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const { +const NodeSwapQueue InplacePass::TryInplaceModifyVar( + const std::string& var, const std::string& cache_var, const size_t& idx, + ir::Graph* graph) const { PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && var_nodes_[var].at(0)->Var() != nullptr); std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); var_desc->SetName(cache_var); - SSANodePair swap_nodes; + NodeSwapQueue swap_nodes; for (size_t i = idx; i < view_.AllOps().size(); ++i) { auto* op = view_.AllOps()[i]; @@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, return swap_nodes; } -void InplacePass::CommitModify(const SSANodePair& swap_nodes, +void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes, ir::Graph* graph) const { for (auto& pair : swap_nodes) { auto *node = pair.first, *cache_node = pair.second; @@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes, } } -void InplacePass::WithdrawModify(const SSANodePair& nodes, +void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const { for (auto& pair : nodes) { auto *node = pair.first, *cache_node = pair.second; diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 1abcf1f279e225839d440ff9c6840ce9b8a6547f..7be7f311852d2b64ce95e1a939371760d03d296b 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -56,7 +56,8 @@ class GraphView { std::map> adj_list_; }; -typedef std::vector> SSANodePair; +// swap pairs in sequence +typedef std::vector> NodeSwapQueue; class InplacePass : public ir::Pass { public: InplacePass(); @@ -68,14 +69,14 @@ class InplacePass : public ir::Pass { void InitSSAGraphNodes() const; private: - const SSANodePair TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const; + const NodeSwapQueue TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; - void CommitModify(const SSANodePair&, ir::Graph* graph) const; + void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const; - void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; + void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const; void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, const size_t& idx) const; diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc deleted file mode 100644 index 69f8f705484450b0544291b19027eb174d7eeb8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_early_delete_pass.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_early_delete_pass.h" -#include -#include -#include -#include "paddle/fluid/framework/details/memory_optimize_helper.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle { -namespace framework { -namespace details { - -static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { - std::queue queue; - queue.push(var_in); - do { - auto* var = queue.front(); - queue.pop(); - for (auto* op : var->PendingOps()) { - auto* compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) { - return compute_op; - } - for (auto* out_var : op->Outputs()) { - queue.push(out_var); - } - } - } while (!queue.empty()); - return nullptr; -} - -std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( - std::unique_ptr graph) const { - auto& graph_pool = Get(kGraphNodePool); - auto& gcs = Get(kGarbageCollector); - - std::unordered_map> unlived_vars; - unlived_vars.reserve(graph_pool.size()); - for (auto& pair : graph_pool) { - unlived_vars.insert(std::make_pair(pair.first, pair.second)); - } - - auto compare_and_insert_early_delete_op = [&]( - OpHandleBase* op, const std::vector& vars) { - if (unlived_vars.empty()) return; - // unlived vars can be deleted after the last used op has finished. - auto* compute_op = dynamic_cast(op); - const auto& places = Get>(kAllPlaces); - for (auto& var : vars) { - auto* var_handle = dynamic_cast(var); - auto var_name = var->Node()->Name(); - auto& var_place = var_handle->place(); - if (unlived_vars.count(var_name) == 0) continue; - if (!unlived_vars[var_name].empty()) { - if (compute_op != nullptr && - unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { - unlived_vars[var_name].erase(compute_op->Node()->Op()); - } - continue; - } - - if (var_handle == nullptr || !var_handle->Node()->IsVar() || - var_handle->Node()->IsCtrlVar()) - continue; - - // shameless copyed from reference count pass. - if (compute_op == nullptr) { - // use next computation op scope - compute_op = FindNextComputationOpHandle(var_handle); - } - auto* early_delete_node = - graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); - GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); - auto* early_delete_handle = new EarlyDeleteOpHandle( - early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); - if (compute_op->Outputs().empty()) { - auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - compute_op->AddOutput(dep_var); - graph->Get(kGraphDepVars).emplace(dep_var); - } - early_delete_handle->AddInput(compute_op->Outputs().front()); - VLOG(5) << "Add early delete op " << var_name << " to Operator" - << compute_op->Name(); - } - }; - - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto& op : all_ops) { - compare_and_insert_early_delete_op(op, op->Inputs()); - compare_and_insert_early_delete_op(op, op->Outputs()); - } - return graph; -} - -} // namespace details -} // namespace framework -} // namespace paddle - -REGISTER_PASS(memory_early_delete_pass, - paddle::framework::details::MemoryEarlyDeletePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) - .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h deleted file mode 100644 index 8215aa1b2baa223a111f9050d5488c5fc8ac0e6e..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_early_delete_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/details/early_delete_op_handle.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class MemoryEarlyDeletePass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index b56ef021ef508a43aac082acbcfa6f543635203e..6345ba335997ec42ebc63f90e9bf6a3ed2648edc 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -13,17 +13,108 @@ // limitations under the License. #include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include #include #include #include #include +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { namespace details { +using paddle::framework::VarDesc; -size_t NodeSizeInBytes(const VarDesc& node) { +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +size_t NodeSize(const VarDesc& node) { auto shape = node.GetShape(); int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); @@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) { return type_size * std::abs(size); } -size_t NodeSizeInBytes(ir::Node* n) { +size_t NodeSize(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - return NodeSizeInBytes(*desc); + return NodeSize(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) { std::string DebugString(ir::Node* var) { return DebugStringImpl(FindVarDescInBlock(var)); } -// return DebugString(var->Var()); } // NOTE(dzh): based ir node, if a large node has been reused // by a small size node, then next time it appear in pool, it will @@ -80,18 +170,17 @@ struct NodeComparator { auto rhs_shape = rhs_desc->GetShape(); if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + return NodeSize(lhs) <= NodeSize(rhs); } else { return false; } } }; -void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { +void OrderedSet::Insert(ir::Node* var) { PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - PADDLE_ENFORCE(op->IsOp()); if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->second.insert(op); + mark_table_[var->Name()]->emplace_back(var); return; } @@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { auto var_shape = var_desc->GetShape(); int batch_size = static_cast(var_shape[0]); - NodeComparator compare_node; + NodeComparator functor; Iter it = nodes_.begin(); while (it != nodes_.end()) { - auto* cache_desc = FindVarDescInBlock(it->first); + auto& prev = it->front(); + auto* cache_desc = FindVarDescInBlock(prev); int cache_batch_size = cache_desc->GetShape()[0]; if ((cache_batch_size == -1 && batch_size == -1) || (cache_batch_size != -1 && batch_size != -1)) { - if (compare_node(it->first, var)) { + if (functor(prev, var)) { ++it; } else { break; @@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { } } - it = - nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + it = nodes_.insert(it, {var}); mark_table_[var->Name()] = it; } -int OrderedNodeList::GetIndex(ir::Node* var) { +int OrderedSet::GetNodeIndexInPool(ir::Node* var) { return std::distance(nodes_.begin(), mark_table_[var->Name()]); } -ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { +ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { ir::Node* found_node = nullptr; - NodeComparator compare_node; + NodeComparator functor; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - if (compare_node(var, it->first)) { - found_node = it->first; + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; break; } } return found_node; } -void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } +bool OrderedSet::Has(ir::Node* var) const { + if (mark_table_.count(var->Name())) { + auto& node_in_samename = mark_table_.at(var->Name()); + auto iter = + std::find_if(node_in_samename->begin(), node_in_samename->end(), + [&](ir::Node* n) { return n->Name() == var->Name(); }); + return iter != node_in_samename->end(); + } + return false; +} -void OrderedNodeList::Erase(const std::string& var) { - PADDLE_ENFORCE(mark_table_.count(var)); - nodes_.erase(mark_table_[var]); - mark_table_.erase(var); +void OrderedSet::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); } -std::string OrderedNodeList::ToString() const { +std::string OrderedSet::ToString() const { std::stringstream ss; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - ss << DebugString(it->first) << " "; + for (auto& node : *it) { + ss << DebugString(node) << " "; + } } return ss.str(); } bool NodeCanReused(ir::Node* node) { + // valid the node is a var node if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; - // auto* desc = node->Var(); - bool flag = NodeCanReused(*node->Var()); + + bool flag = true; + // op output force generated in cpu, can not be reused. for (auto* op : node->inputs) { if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused. flag &= framework::AttrReader(op->Op()->GetAttrMap()) .Get("force_cpu") == 0; } } + // var desc validation. + flag &= NodeCanReused(*node->Var()); return flag; } bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); - if (node.Persistable() || type != proto::VarType::LOD_TENSOR || - node.GetShape().empty()) { + if (!(type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY)) { + return false; + } + if (node.Persistable() || node.GetShape().empty()) { return false; } // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad @@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) { return false; } +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 064183d61ea7386b6b45034c90fd7569a8647f60..0bfaf827fea84030de48a9984197f5b39f5c9261 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,41 +29,41 @@ namespace paddle { namespace framework { namespace details { -constexpr char kFetchedVars[] = "fetched_vars"; -constexpr char kGraphNodePool[] = "graph_node_pool"; +constexpr char kAllOpDescs[] = "all_op_descs"; -// NOTE(dzh): Variable and the operators use the var. -// for early delete pass. -// Because analysis var pass build base on ir::Node, which maybe released -// or modified between passes, so we use OpDesc* to mark ops. -using GraphNodePool = std::vector< - std::pair /* ops */>>; +std::vector SortOpLikeDescOrder(const ir::Graph& graph); -// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size is determined in runtime. -// the node batch_size equal -1 always ranking in the front than the node not. +// NOTE(dzh): A ordered set for node reuse in memory optimize. +// the orderedset sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size, which is determined in runtime. +// So the reuse happens between nodes who's batch_size both are -1 +// simultaneously or not. +// +// sort rule: +// rule 0 : smaller node ranking in front. +// rule 1 : batch_size equal -1 ranking in the front than the node not. +// // For example, // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. -// O(1) insert, delete -class OrderedNodeList { - public: - using NodePair = std::pair>; - using Iter = typename std::list::iterator; - using ConstIter = typename std::list::const_iterator; - void Insert(ir::Node* var, ir::Node* op); +class OrderedSet { + public: + // nodes with same name exists in pool. + using NodeVector = std::vector; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + void Insert(ir::Node* var); void Erase(ir::Node* var); - - void Erase(const std::string& var); - - bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } - - bool Has(const std::string& var) { return mark_table_.count(var); } - - ir::Node* NodeMatch(ir::Node* var) const; + bool Has(ir::Node* var) const; + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + // find the bestfit shape node block with var. + ir::Node* FindBestFitNode(ir::Node* var) const; // map store non-const iterator, can not promise const - int GetIndex(ir::Node* var); + int GetNodeIndexInPool(ir::Node* var); // pool all node to string std::string ToString() const; @@ -69,18 +71,54 @@ class OrderedNodeList { Iter end() { return nodes_.end(); } ConstIter begin() const { return nodes_.begin(); } ConstIter end() const { return nodes_.end(); } - size_t size() const { return nodes_.size(); } - void Clear() { - mark_table_.clear(); - nodes_.clear(); - } + size_t size() const { return nodes_.size(); } private: // for searching. std::unordered_map mark_table_; - // node swap pairs. var -> ops dep var - std::list nodes_; + // node pool + std::list nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // IR Graph + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort }; // valid a tensor can be reuse or not @@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node); bool OpHasSubBlock(OpDesc* desc); // node memory size in bytes -size_t NodeSizeInBytes(ir::Node* n); +size_t NodeSize(ir::Node* n); // node memory size in bytes -size_t NodeSizeInBytes(const VarDesc&); +size_t NodeSize(const VarDesc&); std::string DebugString(ir::Node* var); +// NOTE(dzhwinter) +// after node reuse, the replaced node shape is +// different with its VarDesc. So need to find the +// correct VarDesc in Block. VarDesc* FindVarDescInBlock(ir::Node* n); +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + template class FilterVariableImpl { public: diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index f2b9baf14a34ace9cc860797280dbd519dfa4f2a..5c13dda9e5491044d2bcbed4b24d438cc8b8a413 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include #include +#include #include #include #include @@ -22,13 +23,19 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { namespace details { -TEST(OrderedNodeList, Normal) { - OrderedNodeList pool; +TEST(OrderedSet, Normal) { + OrderedSet pool; std::vector> nodes; // clang-format off @@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) { nodes.emplace_back(std::move(node)); } + // Insert for (auto& node : nodes) { - pool.Insert(node.get(), op.get()); + pool.Insert(node.get()); + } + + // Has/size + ASSERT_EQ(pool.size(), shapes.size()); + for (auto& node : nodes) { + ASSERT_TRUE(pool.Has(node.get())); } // assert its order and interface. @@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) { std::cout << pool.ToString() << std::endl; ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); - ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); { auto v1 = block_desc->Var("11"); v1->SetShape({-1, 256, 56, 56}); std::unique_ptr node1 = ir::CreateNodeForTest(v1); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); + auto* cache = pool.FindBestFitNode(node1.get()); ASSERT_EQ(cache, nullptr); } { @@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) { v2->SetShape({-1, 2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v2); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] } { auto v3 = block_desc->Var("13"); v3->SetShape({2, 5}); std::unique_ptr node1 = ir::CreateNodeForTest(v3); node1->inputs.emplace_back(op.get()); - auto* cache = pool.NodeMatch(node1.get()); - ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] + } +} +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); } } diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 85de14a60a8fe6958794f0ac25768b9da1943f9d..41e4a834df0abab069ab1f6cd21c8479b911250c 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -43,11 +43,6 @@ namespace paddle { namespace framework { namespace details { -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - std::unique_ptr MemoryOptimizePass::ApplyImpl( std::unique_ptr graph) const { auto nodes = graph->Nodes(); @@ -77,7 +72,7 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || skip_set_.count(var->Name())) continue; - ir::Node* cache = pool_.NodeMatch(var); + ir::Node* cache = pool_.FindBestFitNode(var); if (var->Name() == FLAGS_memory_optimize_debug) { VLOG(3) << "start match var " << DebugString(var) << " of op " @@ -95,11 +90,12 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( << "replace it again. Skip this candidate."; continue; - int node_idx_in_pool = pool_.GetIndex(cache); + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d", std::to_string(reuse_id++), DebugString(var), DebugString(cache), node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. // reused var maybe re-fill into the pool cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); @@ -112,6 +108,7 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( pool_.Erase(cache); } + // fill the pool std::unordered_set unlived_vars; for (auto var : cfg_->LiveIn(op)) { @@ -120,36 +117,15 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } } for (auto var : unlived_vars) { - ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + ir::Node* var_node = cfg_->GetNodeByName(var, op); if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node, op); + pool_.Insert(var_node); } } } } graph->ResolveHazard(var_nodes_); - // For early delete pass. use GraphNodePool load the unlived vars. - // 1. find all deps op for each unlived var in memory pool. - for (auto& op : graph->Nodes()) { - for (auto& var : op->inputs) { - if (pool_.Has(var)) { - pool_.Insert(var, op); - } - } - } - // 2. convert ir node based memory pool to graph node - // because Node* maybe released bettwen passes. - auto& graph_pool = graph->Get(kGraphNodePool); - for (auto it = pool_.begin(); it != pool_.end(); ++it) { - std::unordered_set descs; - for (auto& op : it->second) { - PADDLE_ENFORCE(op->IsOp()); - descs.insert(op->Op()); - } - graph_pool.push_back(std::make_pair(it->first->Name(), descs)); - } - return graph; } @@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { PADDLE_ENFORCE(sub_op != nullptr); for (auto* var : sub_op->outputs) { if (NodeCanReused(var)) { - ir::Node* cache = pool_.NodeMatch(var); + ir::Node* cache = pool_.FindBestFitNode(var); if (cache != nullptr) { if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { continue; } - int node_idx_in_pool = pool_.GetIndex(cache); + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d", std::to_string(sub_reuse_id++), DebugString(var), @@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, var_nodes_.at(var).clear(); } -std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); - // 1. get op desc order - auto& op_descs = graph.Get>(kAllOpDescs); - - // 2. topology sort order - auto nodes = graph.Nodes(); - std::deque ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map op_deps; - std::list ready_ops; - std::unordered_map> pending_ops; - - for (auto* op : ops) { - std::unordered_set preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector ret; - std::list op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. - std::list prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair& p) { return p.second == 0; })); - - return ret; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. - for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) { - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. - auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. - // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - } - } -} - -const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { - auto it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::vector ControlFlowGraph::Ops() const { return ops_; } - -std::vector& ControlFlowGraph::Ops() { return ops_; } - -ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. - ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - if (output->Name() == name) { - found_node = output; - } - } - } - return found_node; -} - } // namespace details } // namespace framework } // namespace paddle REGISTER_PASS(memory_optimize_pass, paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kGraphNodePool) .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index 3d6b1897f3b5106054b8f647f9cf613ebd1d65ff..593ffc10fc99d26b1ee9174ceef081581126e7e8 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -32,20 +32,15 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph); - -class ControlFlowGraph; class MemoryOptimizePass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - - private: // fill the variable map(var_nodes) by version. void InitSSAGraphNodes() const; + + private: // update program descs void RenameVarInGraphDesc(const std::string& var, const std::string& cache_var, size_t idx) const; @@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass { private: // Reuse Node Pool, Owned. - mutable OrderedNodeList pool_; + mutable OrderedSet pool_; // controlflow Graph mutable std::unique_ptr cfg_; // skip set @@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass { mutable std::map> var_nodes_; }; -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // For IR Graph in parallelexecutor - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set LiveIn(ir::Node* op) const; - const std::set LiveOut(ir::Node* op) const; - const std::set Use(ir::Node* op) const; - const std::vector Ops() const; - std::vector& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - using NodeListMap = std::unordered_map>; - using VarSetMap = std::map>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - - std::vector ops_; // op sequence by topology sort -}; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc deleted file mode 100644 index 3d3dfa93594d496431f7cb60dceb26f20250fc16..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/memory_optimize_pass_test.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/memory_optimize_pass.h" -#include -#include -#include -#include "glog/logging.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/graph_test_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -REGISTER_OPERATOR(sum, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(assign, paddle::framework::DummyOp, - paddle::framework::AssignOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -/* - https://en.wikipedia.org/wiki/Live_variable_analysis - Create a customed classical dependency graph, left row is the instruction - number. - 1. a = 1 - 2. b = a - 3. c = a - 4. d = b + c - 5. e = d - - a--------+ - | | - b c - | | - d--------+ - | - e - Then analysis these variable's liveness range - */ - -namespace paddle { -namespace framework { -namespace details { - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -inline static ProgramDesc FillProgramDesc() { - ProgramDesc prog; - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"c"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"d"}); - op->SetOutput("Out", {"e"}); - } - return prog; -} - -TEST(CFGGraph, IRGraph) { - // prepare ir graph - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - ControlFlowGraph cfg(graph); - cfg.LiveVariableAnalysis(); - - // test assign op - ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); - - // test assign op - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); - - // test sum op - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); - ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); - - // test assign op - ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); - ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); -} - -// 1. normal test -TEST(SortOpLikeDescOrder, NormalTest) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto nodes = SortOpLikeDescOrder(graph); - auto op_descs = prog.Block(0).AllOps(); - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 2. remove some op_desc -TEST(SortOpLikeDescOrder, RemoveOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - auto nodes = graph.Nodes(); - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->outputs.back()->Name() == "e") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - graph.RemoveNode(found_node); - graph.RemoveNode(e); - - // other node keeps the same order - auto remain_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < remain_nodes.size(); ++i) { - auto node = remain_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 3. add some op_desc -TEST(SortOpLikeDescOrder, AddOpDesc) { - auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // cached desc different with real one - // mimic the intermidiete pass modify the programdesc. - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); - - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - op_descs.insert(op_descs.begin() + 4, op); - - auto nodes = SortOpLikeDescOrder(graph); - - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 4. add and delete some op_desc -TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // remove sum node - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - auto nodes = graph.Nodes(); - for (auto node : nodes) { - if (node->Name() == "sum") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* c = find_node_in_graph("c"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(c->outputs.begin(), c->outputs.end(), found_node); - ir::Node* pending_op = found_node->outputs[0]->outputs[0]; - graph.RemoveNode(e); - graph.RemoveNode(pending_op); - graph.RemoveNode(found_node); - } - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - op_descs.insert(op_descs.begin() + 2, op); - - // check the order - auto mynodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < mynodes.size(); ++i) { - auto node = mynodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 5. add and replace some op_desc inplace. -TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - auto op_descs = prog.Block(0).AllOps(); - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - - op_descs.emplace_back(op); - - // replace op_desc inplace - auto nodes = graph.Nodes(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->Op() && node->Name() == "assign") { - if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { - found_node = node; - break; - } - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(e->inputs.begin(), e->inputs.end(), found_node); - graph.RemoveNode(found_node); - } - op_descs.erase(op_descs.begin() + 3); - - auto replace_op = prog.MutableBlock(0)->AppendOp(); - replace_op->SetType("sum"); - replace_op->SetInput("X", {"d", "d1"}); - replace_op->SetOutput("Out", {"e"}); - { - ir::Node* sum2 = graph.CreateOpNode(replace_op); - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - ir::Node* d1 = find_node_in_graph("d1"); - sum2->inputs.emplace_back(d); - sum2->inputs.emplace_back(d1); - sum2->outputs.emplace_back(e); - e->inputs.emplace_back(sum2); - d->outputs.emplace_back(sum2); - d1->outputs.emplace_back(sum2); - } - - op_descs.emplace_back(replace_op); - // compare op order - auto graph_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < graph_nodes.size(); ++i) { - auto node = graph_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index cc2c8bfef9f9f54c2e499467df0d22ce3f69d6b8..879fb29d5926941e574d0080051c195293bc60a9 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index a04c08bc2eb3bae797d648b30a22a5fee7ba0eaa..ea3034877fcea80de0124df64d8d23028bdcb7b3 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -21,8 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - class SequentialExecutionPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index 03ab2a2b6c5dc07805fddddc3ac53f61e7b6a697..a3ccf677c90e8466f6c89041979336d45c1ac942 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference { bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { return in.Name() != out.Name() && details::NodeCanReused(in) && details::NodeCanReused(out) && - details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); + details::NodeSize(out) <= details::NodeSize(in); } }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f61c9e3a91146704faa6c5b1058137bef67d2a3e..ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,14 +171,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - if (build_strategy_.memory_early_delete_) { - auto early_delete_pass = - ir::PassRegistry::Instance().Get("memory_early_delete_pass"); - early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = early_delete_pass->Apply(std::move(graph)); - } - VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); + VLOG(10) << "Eager Deletion Threshold " + << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { for (size_t i = 0; i < graphs.size(); ++i) { graphs[i] = member_->PrepareGCAndRefCnts( @@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle -USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 953618560913229cd1e47659ad61e621efc10ed1..87f0f307d30bc90a43a698c3766b16c975f0635e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -22,11 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); +DECLARE_bool(benchmark); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 46aca62a27490b0141245beeea9a3e61d0fee651..e983ae327d69389526ae4cae226b0eb324759700 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_bool(benchmark); namespace paddle { namespace memory { @@ -198,7 +199,7 @@ void *Alloc(const platform::CUDAPlace &place, << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - if (VLOG_IS_ON(3)) { + if (FLAGS_benchmark) { allocation::GPUMemMonitor.Add(place.device, size); } if (FLAGS_init_allocated_mem) { @@ -216,7 +217,7 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - if (VLOG_IS_ON(3)) { + if (FLAGS_benchmark) { allocation::GPUMemMonitor.Minus(place.device, size); } #else diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 655ce8485d4584aa0955315b045da6bf541f7fe2..60b2d83f15746eab0a4d29c7965c064690b6d46d 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -14,6 +14,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" +DEFINE_bool(benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0493f60860ca45975be13a8c08f6bc52de107f18..351513712cc4297bf7fbe67878aeba162ef66e4d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1099,10 +1099,6 @@ All parameter, weight, gradient are variables in Paddle. "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) - .def_property( - "memory_early_delete", - [](const BuildStrategy &self) { return self.memory_early_delete_; }, - [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def_property( "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 52b260efd15066a114a8146106685043654c91ea..22212ae9a216acaab3f295f1f8d091829a0aa471 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,8 @@ class ParallelExecutor(object): else framework.default_main_program() # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. - build_strategy.enable_inplace = False if main._is_mem_optimized else True + if build_strategy.enable_inplace is None: + build_strategy.enable_inplace = False if main._is_mem_optimized else True scope = scope if scope is not None else executor.global_scope() if share_vars_from and not isinstance(share_vars_from,