From 7cd24b13182bcdcbdb455a430d54d70172e73a59 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 18 Dec 2018 13:15:29 +0800 Subject: [PATCH] add ir memory optimize. (#14530) * follow comments. test=develop * Fix typo * fix compile error. test=develop * merge develop branch. test=develop * Remove set_equal * Polish code * Delete unused functions test=develop * polish code. test=develop * follow comment * polish code. * fix windows compile error. test=develop * fix op handle. * rerun ci. test=develop * rerun ci. test=develop * rerun macci. test=develop * polish code. test=develop * rewrite sort code. test=develop * remove unused code. test=develop * fix tests. test=develop * fix conflict. test=develop * follow comment. test=develop * merge develop branch. test=develop * fix tests. test=develop * remove ToTypeIndex. test=develop * rerun ci. test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 14 +- .../framework/details/analysis_var_pass.cc | 656 ++++++++++++++++++ .../framework/details/analysis_var_pass.h | 120 ++++ .../details/analysis_var_pass_test.cc | 470 +++++++++++++ .../fluid/framework/details/build_strategy.cc | 34 +- .../fluid/framework/details/build_strategy.h | 11 + .../details/early_delete_op_handle.h | 140 ++++ .../details/memory_early_delete_pass.cc | 117 ++++ .../details/memory_early_delete_pass.h | 32 + .../framework/details/memory_reuse_types.cc | 155 +++++ .../framework/details/memory_reuse_types.h | 87 +++ .../details/memory_reuse_types_test.cc | 99 +++ .../details/multi_devices_graph_print_pass.cc | 3 +- .../details/multi_devices_graph_print_pass.h | 5 +- .../fluid/framework/details/op_handle_base.h | 2 +- paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/framework/ir/graph_helper.cc | 3 +- paddle/fluid/framework/ir/graph_helper.h | 1 + paddle/fluid/framework/ir/node.cc | 8 + paddle/fluid/framework/ir/node.h | 5 +- paddle/fluid/framework/parallel_executor.cc | 19 +- paddle/fluid/framework/tensor_test.cc | 16 + paddle/fluid/pybind/pybind.cc | 8 + python/paddle/fluid/__init__.py | 2 +- .../unittests/parallel_executor_test_base.py | 2 + .../unittests/test_ir_memory_optimize_pass.py | 123 ++++ .../memory_optimization_transpiler.py | 61 +- 27 files changed, 2169 insertions(+), 29 deletions(-) create mode 100644 paddle/fluid/framework/details/analysis_var_pass.cc create mode 100644 paddle/fluid/framework/details/analysis_var_pass.h create mode 100644 paddle/fluid/framework/details/analysis_var_pass_test.cc create mode 100644 paddle/fluid/framework/details/early_delete_op_handle.h create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.cc create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types.cc create mode 100644 paddle/fluid/framework/details/memory_reuse_types.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 97f7713d9..63a68ba3a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base 
scope) +cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) - +cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) +cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) @@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass + memory_optimize_pass) diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc new file mode 100644 index 000000000..223b9da3c --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.cc @@ -0,0 +1,656 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
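Before the implementation, a minimal, self-contained sketch of the liveness-driven reuse decision this pass performs may help; every name in it is illustrative and none of it is the pass's actual API (the real pass works on ir::Node, checks NodeCanReused, and picks candidates through OrderedNodePairPool::NodeMatch, which this sketch ignores):

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct MiniOp {
  std::string name;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

int main() {
  // b = f(a); c = g(b); with live_out(f) = {b} and live_out(g) = {c}
  std::vector<MiniOp> ops = {{"f", {"a"}, {"b"}}, {"g", {"b"}, {"c"}}};
  std::vector<std::set<std::string>> live_out = {{"b"}, {"c"}};
  std::set<std::string> pool;  // variables whose memory is free to take over
  for (size_t i = 0; i < ops.size(); ++i) {
    for (const std::string& out : ops[i].outputs) {
      if (!pool.empty()) {  // reuse a dead variable instead of allocating
        std::cout << out << " reuses " << *pool.begin() << "\n";
        pool.erase(pool.begin());
      }
    }
    for (const std::string& in : ops[i].inputs) {
      if (live_out[i].count(in) == 0) pool.insert(in);  // `in` dies here
    }
  }
  return 0;
}

Running the sketch prints "c reuses a": the output of the second op takes over the storage of a, which died after the first op, which is the kind of rewrite ApplyImpl below performs by renaming var to cache.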
+ +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. default disabled."); +DEFINE_string(memory_optimize_debug, "", + "debug the operator output variable when do the variable reuse." + "memory reuse pass." + "only for debug, default disabled."); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +std::unique_ptr AnalysisVarPass::ApplyImpl( + std::unique_ptr graph) const { + auto nodes = graph->Nodes(); + auto subblock_vars = GetSubBlockVars(nodes); + skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + + cfg_.reset(new details::ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + if (OpHasSubBlock(op_desc)) { + if (FLAGS_enable_subgraph_optimize) { + SubGraphOptimize(op_desc); + } else { + VLOG(3) << op->Name() + << " has subblock, but disable subgraph optimize. skipped."; + continue; + } + } + + for (auto& var : op->outputs) { + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.NodeMatch(var); + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } + if (cache != nullptr) { + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." + << var->Name() << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. 
But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + } + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + if (var_node == nullptr) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node, op); + } + } + } + } + graph->ResolveHazard(var_nodes_); + + // For early delete pass. use GraphNodePool load the unlived vars. + // 1. find all deps op for each unlived var in memory pool. + for (auto& op : graph->Nodes()) { + for (auto& var : op->inputs) { + if (pool_.Has(var)) { + pool_.Insert(var, op); + } + } + } + // 2. convert ir node based memory pool to graph node + // because Node* maybe released bettwen passes. + auto& graph_pool = graph->Get(kGraphNodePool); + for (auto it = pool_.begin(); it != pool_.end(); ++it) { + std::unordered_set descs; + for (auto& op : it->second) { + PADDLE_ENFORCE(op->IsOp()); + descs.insert(op->Op()); + } + graph_pool.push_back(std::make_pair(it->first->Name(), descs)); + } + + return graph; +} + +void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.NodeMatch(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. 
+ sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + +std::unordered_set AnalysisVarPass::GetSubBlockVars( + const std::unordered_set& nodes) const { + std::unordered_set vars; + for (auto& op : nodes) { + if (!op->IsOp() || op->Op() == nullptr) continue; + auto* op_desc = op->Op(); + if (OpHasSubBlock(op_desc)) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + vars.insert(inputs.begin(), inputs.end()); + vars.insert(outputs.begin(), outputs.end()); + } + } + return vars; +} + +void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +void AnalysisVarPass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. 
+ PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + } + + // release node of unused var in graph + for (auto* node : var_nodes_[var]) { + graph->RemoveNode(node); + } + var_nodes_.at(var).clear(); +} + +bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { + if (!node->IsVar() || node->IsCtrlVar()) return false; + auto* desc = node->Var(); + auto type = desc->GetType(); + if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || + desc->GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node->Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + if (skip_set_.count(name)) return false; + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + return framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return true; +} + +bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. 
topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) 
{
+      // do nothing
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveness analysis (a.k.a. the reversed_ops
+  // algorithm). It computes the liveness of each variable by iterating the
+  // operators from end to begin and building the live-in/live-out variable
+  // sets of every op; the difference between the two sets is what the
+  // variable reuse relies on. For details refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated in the previous iteration; empty on the
+    // first visit.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+
+    // If live_in has not changed, the liveness analysis of the
+    // predecessors is complete.
+    //
+    // Otherwise, recalculate the predecessors' liveness.
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update the graph from begin_idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in uses, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+
+ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
+                                               ir::Node* op) const {
+  // In the SSA graph, different versions of a node share the same name.
+  // This function returns the latest version of the variable before the
+  // target op. It may return nullptr, e.g. for a data node.
+ ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h new file mode 100644 index 000000000..144204bea --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph); +// sort op in bfs order +std::vector BFSSortGraphOps(const ir::Graph& graph); + +class ControlFlowGraph; + +class AnalysisVarPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // valid a tensor can be reuse or not + bool NodeCanReused(ir::Node* node) const; + // scan subblock and collect the output/input variables. + std::unordered_set GetSubBlockVars( + const std::unordered_set&) const; + // check op has subblock or not + bool OpHasSubBlock(OpDesc* desc) const; + + private: + // Reuse Node Pool, Owned. 
+ mutable OrderedNodePairPool pool_; + // controlflow Graph + mutable std::unique_ptr cfg_; + // skip set + mutable std::unordered_set skip_set_; + // var nodes + mutable std::map> var_nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // For IR Graph in parallelexecutor + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc new file mode 100644 index 000000000..9bc4fd33f --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
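The tests below drive ControlFlowGraph::LiveVariableAnalysis over the small straight-line program built by FillProgramDesc. As a hedged, standalone sketch of the backward liveness recurrence those assertions encode (a single reverse sweep is enough because the program is a straight line; all names are illustrative):

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Step {
  std::set<std::string> use, def;
};

int main() {
  // b = assign(a); c = assign(a); d = sum(b, c); e = assign(d)
  std::vector<Step> ops = {{{"a"}, {"b"}},
                           {{"a"}, {"c"}},
                           {{"b", "c"}, {"d"}},
                           {{"d"}, {"e"}}};
  std::vector<std::set<std::string>> live_in(ops.size()), live_out(ops.size());
  // live_out(op) = live_in(successor); live_in(op) = use(op) + (live_out(op) - def(op))
  for (int i = static_cast<int>(ops.size()) - 1; i >= 0; --i) {
    if (i + 1 < static_cast<int>(ops.size())) live_out[i] = live_in[i + 1];
    live_in[i] = ops[i].use;
    for (const std::string& v : live_out[i])
      if (ops[i].def.count(v) == 0) live_in[i].insert(v);
  }
  for (size_t i = 0; i < ops.size(); ++i) {
    std::cout << "op" << i << " live_in:";
    for (const std::string& v : live_in[i]) std::cout << " " << v;
    std::cout << "\n";
  }
  return 0;
}

For the chain above this reproduces the live-in/live-out sets asserted in TEST(CFGGraph, IRGraph).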
+ +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. 
e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +template +inline static std::string DebugString(const Container& c) { + std::stringstream ss; + for (auto& item : c) { + ss << item << " "; + } + return ss.str(); +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. 
remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. 
add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. 
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f2..779a9ed52 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -14,11 +14,16 @@ limitations under the License. 
*/
#include "paddle/fluid/framework/details/build_strategy.h"
+#include
+#include
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    }
    VLOG(1) << "CollectiveContext:" << context->String();
+    // NOTE(dzh): memory optimize should be a runtime pass.
+    // However, after multi_devices_pass, VarHandle and OpHandle are the
+    // de facto IR, so any reuse on the Graph afterwards is meaningless.
+    // As a side effect, memory optimize cannot foresee the fetched vars, so
+    // the fetch list should be set persistable before calling the Run interface.
+    if (strategy.memory_optimize_) {
+      auto analysis_var_pass = AppendPass("analysis_var_pass");
+    }
    // Convert graph to run on multi-devices.
    auto multi_devices_pass = AppendPass("multi_devices_pass");
    multi_devices_pass->SetNotOwned("strategy",
@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
-      multi_devices_print_pass->SetNotOwned(
-          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      const std::string graph_path =
+          string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(),
+                          "_multi_devices_graph");
+      multi_devices_print_pass->Set(kGraphvizPath,
+                                    new std::string(graph_path));
      multi_devices_print_pass->Set(
          "graph_printer", new details::GraphvizSSAGraphPrinter);
    }
@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
  CreatePassesFromStrategy(false);
  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    if (pass->Type() == "multi_devices_pass") {
      pass->Erase("places");
@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned("nccl_ctxs", nctx);
#endif
+    } else if (pass->Type() == "analysis_var_pass") {
+      const std::vector<OpDesc *> *all_op_descs =
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
+      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
+                                              all_op_descs);  // take ownership
+      graph->Set(kGraphNodePool,
+                 new GraphNodePool);  // take ownership
+
+      pass->Erase(kAllOpDescs);
+      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
+
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
  }
  return graph;
}
+
} // namespace details
} // namespace framework
} // namespace paddle
@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
+USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index c97be1695..29396501d 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++
b/paddle/fluid/framework/details/build_strategy.h @@ -60,8 +60,15 @@ struct BuildStrategy { kCustomized = 2, }; + enum class OptimizeStrategy { + // To be Implemented,bruteforce, recursive compute unused var names. + kBruteForce = 0, + kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. + }; + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; @@ -69,6 +76,10 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool memory_optimize_{false}; + + bool memory_early_delete_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h new file mode 100644 index 000000000..c8382d34b --- /dev/null +++ b/paddle/fluid/framework/details/early_delete_op_handle.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class EarlyDeleteOpHandle : public OpHandleBase { + public: + EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, + const platform::Place& place, + const std::vector& names, + GarbageCollector* gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + names_(names), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(place); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } +#endif + } + ~EarlyDeleteOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif + } + + std::string Name() const override { return "early_delete"; } + + protected: + void RunImpl() override { + std::vector> tensors; + auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + for (auto& var_name : names_) { + auto* var = local_scope->FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, + string::Sprintf("Local Scope not has var %s", var_name)); + if (var->IsType()) { + tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + tensors.emplace_back(var->GetMutable() + ->mutable_value() + 
->MoveMemoryHolder()); + } else if (var->IsType()) { + LoDTensorArray* tensor_array = var->GetMutable(); + for (auto& tensor : *tensor_array) { + tensors.emplace_back(tensor.MoveMemoryHolder()); + } + } + } + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors( + const std::vector>& tensors) { + if (platform::is_cpu_place(place_)) { + ClearCPUTensors(tensors); + } else { + ClearGPUTensors(tensors); + } + } + + void ClearCPUTensors( + const std::vector>& tensors) { + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + gc->Add(tensors); + } + } + + void ClearGPUTensors( + const std::vector>& tensors) { +#ifdef PADDLE_WITH_CUDA + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast(gc_) != nullptr; +#endif + } + + const Scope* scope_; + const platform::Place place_; + std::vector names_; + GarbageCollector* gc_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext* dev_ctx_; + cudaEvent_t event_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc new file mode 100644 index 000000000..06a2451c1 --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
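The pass below attaches an EarlyDeleteOpHandle behind the computation op that last reads each unlived variable recorded in kGraphNodePool. A rough, illustrative reduction of that idea to a plain use counter (not the pass's actual API):

#include <iostream>
#include <map>
#include <string>

int main() {
  // remaining readers of each unlived variable (what kGraphNodePool records
  // as OpDesc sets, reduced here to plain counts)
  std::map<std::string, int> remaining_uses = {{"b", 2}, {"c", 1}};
  auto reader_finished = [&](const std::string& var) {
    if (--remaining_uses[var] == 0)
      std::cout << "free " << var << " right after its last reader\n";
  };
  reader_finished("c");  // c had a single reader: freed immediately
  reader_finished("b");
  reader_finished("b");  // b is freed only after its second (last) reader
  return 0;
}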
+ +#include "paddle/fluid/framework/details/memory_early_delete_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { + std::queue queue; + queue.push(var_in); + do { + auto* var = queue.front(); + queue.pop(); + for (auto* op : var->PendingOps()) { + auto* compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto* out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + +std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( + std::unique_ptr graph) const { + auto& graph_pool = Get(kGraphNodePool); + auto& gcs = Get(kGarbageCollector); + + std::unordered_map> unlived_vars; + unlived_vars.reserve(graph_pool.size()); + for (auto& pair : graph_pool) { + unlived_vars.insert(std::make_pair(pair.first, pair.second)); + } + + auto compare_and_insert_early_delete_op = [&]( + OpHandleBase* op, const std::vector& vars) { + if (unlived_vars.empty()) return; + // unlived vars can be deleted after the last used op has finished. + auto* compute_op = dynamic_cast(op); + const auto& places = Get>(kAllPlaces); + for (auto& var : vars) { + auto* var_handle = dynamic_cast(var); + auto var_name = var->Node()->Name(); + auto& var_place = var_handle->place_; + if (unlived_vars.count(var_name) == 0) continue; + if (!unlived_vars[var_name].empty()) { + if (compute_op != nullptr && + unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { + unlived_vars[var_name].erase(compute_op->Node()->Op()); + } + continue; + } + + if (var_handle == nullptr || !var_handle->Node()->IsVar() || + var_handle->Node()->IsCtrlVar()) + continue; + + // shameless copyed from reference count pass. 
+ if (compute_op == nullptr) { + // use next computation op scope + compute_op = FindNextComputationOpHandle(var_handle); + } + auto* early_delete_node = + graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); + GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); + auto* early_delete_handle = new EarlyDeleteOpHandle( + early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); + if (compute_op->Outputs().empty()) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + early_delete_handle->AddInput(compute_op->Outputs().front()); + VLOG(5) << "Add early delete op " << var_name << " to Operator" + << compute_op->Name(); + } + }; + + auto all_ops = ir::FilterByNodeWrapper(*graph); + for (auto& op : all_ops) { + compare_and_insert_early_delete_op(op, op->Inputs()); + compare_and_insert_early_delete_op(op, op->Outputs()); + } + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_early_delete_pass, + paddle::framework::details::MemoryEarlyDeletePass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h new file mode 100644 index 000000000..8215aa1b2 --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/details/early_delete_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryEarlyDeletePass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc new file mode 100644 index 000000000..2b9ff518b --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
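A small, illustrative sketch of the insertion order that OrderedNodePairPool::Insert below maintains: entries with a runtime (-1) batch dimension come first, then fixed-shape entries, each group in ascending byte size. The Entry type and shapes here are invented for the example, and float storage is assumed:

#include <cstdlib>
#include <iostream>
#include <list>
#include <string>
#include <vector>

struct Entry {
  std::string name;
  std::vector<long> shape;
};

static long Bytes(const Entry& e) {
  long n = 1;
  for (long d : e.shape) n *= d;
  return std::labs(n) * static_cast<long>(sizeof(float));
}

int main() {
  std::list<Entry> pool;
  auto insert = [&](const Entry& e) {
    auto it = pool.begin();
    for (; it != pool.end(); ++it) {
      bool cur_batched = it->shape[0] == -1;
      bool new_batched = e.shape[0] == -1;
      if (cur_batched == new_batched) {
        if (Bytes(*it) > Bytes(e)) break;  // keep each group in ascending size
      } else if (!cur_batched && new_batched) {
        break;  // entries with a -1 batch dimension go in front
      }
    }
    pool.insert(it, e);
  };
  insert({"node3", {1, 1024}});
  insert({"node0", {-1, 1}});
  insert({"node2", {1, 1}});
  insert({"node1", {-1, 1, 1}});
  for (const Entry& e : pool) std::cout << e.name << " ";
  std::cout << "\n";  // node0 node1 node2 node3
  return 0;
}

The printed order is node0 node1 node2 node3, matching the example in the NOTE in memory_reuse_types.h.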
+ +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +size_t NodeSizeInBytes(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + auto shape = desc->GetShape(); + size_t type_size = SizeOfType(desc->GetDataType()); + int size = 1; + for (auto& s : shape) { + size *= s; + } + return type_size * std::abs(size); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} +// return DebugString(var->Var()); } + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + } else { + return false; + } + } +}; + +void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + PADDLE_ENFORCE(op->IsOp()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->second.insert(op); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator compare_node; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto* cache_desc = FindVarDescInBlock(it->first); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (compare_node(it->first, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = + nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + mark_table_[var->Name()] = it; +} + +int OrderedNodePairPool::GetIndex(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator compare_node; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (compare_node(var, it->first)) { + found_node = it->first; + break; + } + } + return found_node; +} + +void OrderedNodePairPool::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); +} + +std::string 
OrderedNodePairPool::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + ss << DebugString(it->first) << " "; + } + return ss.str(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h new file mode 100644 index 000000000..9a9c1d948 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kFetchedVars[] = "fetched_vars"; +constexpr char kGraphNodePool[] = "graph_node_pool"; + +// NOTE(dzh): Variable and the operators use the var. +// for early delete pass. +// Because analysis var pass build base on ir::Node, which maybe released +// or modified between passes, so we use OpDesc* to mark ops. +using GraphNodePool = std::vector< + std::pair /* ops */>>; + +// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size is determined in runtime. +// the node batch_size equal -1 always ranking in the front than the node not. +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. +// O(1) insert, delete +class OrderedNodePairPool { + public: + using NodePair = std::pair>; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + + void Insert(ir::Node* var, ir::Node* op); + + void Erase(ir::Node* var); + + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + + ir::Node* NodeMatch(ir::Node* var) const; + // map store non-const iterator, can not promise const + int GetIndex(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map mark_table_; + // node swap pairs. 
var -> ops dep var + std::list nodes_; +}; + +// node memory size in bytes +size_t NodeSizeInBytes(ir::Node* n); + +std::string DebugString(ir::Node* var); + +// std::string DebugString(VarDesc* var); +VarDesc* FindVarDescInBlock(ir::Node* n); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc new file mode 100644 index 000000000..d2fabf5ce --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(OrderedNodePairPool, Normal) { + OrderedNodePairPool pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get(), op.get()); + } + + // assert its order and interface. 
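+  // Given the insertion rules, the expected order at this point is:
+  //   [-1,1] [-1,1,5] [-1,10] [-1,2,5] [-1,20] [1,2] [5,2] [10,20]
+  // i.e. the batch-size(-1) group first, each group ascending by element count.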
+ std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 8f92f0948..c20307384 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace paddle REGISTER_PASS(multi_devices_print_pass, - paddle::framework::details::SSAGraghBuilderWithPrinter); + paddle::framework::details::SSAGraghBuilderWithPrinter) + .RequirePassAttr(paddle::framework::details::kGraphvizPath); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index c00685fa1..b06c87a5c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} @@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass { std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); return graph; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c6..b1a82e877 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { namespace details { -constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe..8670dcfed 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -162,7 +162,10 @@ void Graph::ResolveHazard( (*it_new)->inputs.empty() ? 
nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; - PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); // Add write after write dependence ir::Node *upstream_op = diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d2d28793c..d99f856d8 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include DEFINE_string(print_sub_graph_dir, "", @@ -121,7 +122,7 @@ std::map> BuildOperationAdjList( } size_t GraphNum(const Graph &graph) { - std::unordered_set nodes = graph.Nodes(); + std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; visited_nodes.reserve(nodes.size()); std::deque q_nodes; diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 8d92c4066..be525151f 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + // Test if the graph contains circle. bool HasCircle(const Graph &graph); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index eac67108e..45d81b937 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,6 +30,14 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f..89dcc677b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -125,6 +124,8 @@ class Node { friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -152,7 +153,9 @@ class Node { std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index eb4baa06b..7e3fe02ea 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/parallel_executor.h" +#include #include #include #include @@ -93,6 +94,7 @@ class ParallelExecutorPrivate { } } + BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -169,6 +171,14 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + if (build_strategy_.memory_early_delete_) { + auto early_delete_pass = + ir::PassRegistry::Instance().Get("memory_early_delete_pass"); + early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = early_delete_pass->Apply(std::move(graph)); + } + VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor( build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif - auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { graph = member_->PrepareGCAndRefCnts(std::move(graph), @@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a0a9a5736..83dea8639 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -74,6 +74,22 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); } // Not sure if it's desired, but currently, Tensor type can be changed. 
{ diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 017598e17..737ae2dd9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) { return self.memory_optimize_; }, + [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "memory_early_delete", + [](const BuildStrategy &self) { return self.memory_early_delete_; }, + [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2dea71d7a..b00510d44 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -150,7 +150,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674..e2a9fc183 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, + use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, @@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py new file mode 100644 index 000000000..6ca65c5d3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
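+
+# This test builds each network twice with the Python-side transpiler disabled
+# (memory_opt=False): once with build_strategy.memory_optimize off and once
+# with it on, and checks that the first/last losses agree to within 1e-6.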
+ +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def _feed_data_helper(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + return img, label + + +def simple_fc_net(use_feed): + x, y = _feed_data_helper(use_feed) + hidden_layer = 4 + for _ in range(hidden_layer): + x = fluid.layers.fc(input=x, size=20, act='relu') + y_predict = fluid.layers.fc(input=x, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +def fc_with_inplace_net(use_feed): + x, y = _feed_data_helper(use_feed) + fc = fluid.layers.fc(input=x, size=20, act='relu') + fc = fluid.layers.fc(input=fc, size=10, act='relu') + reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5]) + reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def _dummy_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_ir_and_python_memory_optimize(self, model, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._dummy_data() + first_loss0, last_loss0 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=False) + first_loss1, last_loss1 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=True) + for loss in zip(first_loss0, first_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + for loss in zip(last_loss0, last_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_net(self): + self._compare_ir_and_python_memory_optimize(simple_fc_net, False) + self._compare_ir_and_python_memory_optimize(simple_fc_net, True) + + def test_fc_with_reshape_net(self): + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False) + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 5a7d04ed1..7b530ba61 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -43,6 +43,7 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False +FLAGS_memory_optimize = "" class OrderedSet(MutableSet): @@ -121,6 +122,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(OrderedSet) self._live_in = defaultdict(OrderedSet) self._live_out = defaultdict(OrderedSet) + self._skip_opt = skip_opt self.pool = [] @@ -144,7 +146,6 @@ class ControlFlowGraph(object): for i in range(self.op_size): self._uses[i].update(self._ops[i].input_arg_names()) self._defs[i].update(self._ops[i].output_arg_names()) - self._live_in[i] = self._uses[i] def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -177,20 +178,52 @@ class ControlFlowGraph(object): worklist.append(d) def _fill_pool(self, i, is_forward): + def comparator(x, cache): + x_shape = x[1] + cache_shape = cache[1] + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if (x_shape[0] == -1 and cache_shape[0] == -1) or \ + (x_shape[0] != -1 and cache_shape[0] != -1) : + return x_size <= cache_size + else: + return False + + def find_var_in_block(x): + known_vars = set() + for op in self._ops: + known_vars.update(op.output_arg_names()) + return x in known_vars + block_desc = self._ops[i].block() in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in in_diff + x for x in sorted(in_diff) if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: for var_name in can_optimize: cache = (var_name, self._find_var(block_desc, var_name, is_forward).shape()) - if cache not in self.pool: - self.pool.append(cache) + if cache not in self.pool and find_var_in_block(var_name): + i = 0 + while i < len(self.pool): + mycache = self.pool[i] + mysize = mycache[1][0] + cache_size = cache[1][0] + if (mysize == -1 and cache_size == -1) or \ + (mysize != -1 and cache_size != -1): + if comparator(mycache, cache): + i += 1 + else: + break + elif mysize == -1 and cache_size != -1: + i += 1 + elif mysize != -1 and cache_size == -1: + break + self.pool.insert(i, cache) def _get_diff(self, a, b): u = a & b @@ -229,7 +262,7 @@ class ControlFlowGraph(object): def _update_skip_opt_set(self): for i in range(self.op_size): op = self._ops[i] - if op.type() == "fill_constant" and op.attr("force_cpu") == True: + if op.has_attr("force_cpu") and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) def release_memory(self, skip_opt_set=None): @@ -281,6 +314,7 @@ class ControlFlowGraph(object): # update skip set to meet users' demand if skip_opt_set: self._skip_opt.update(skip_opt_set) + counter = 0 for i in range(self.op_size): op = self._ops[i] if op.type() in SUB_BLOCK_OPS: @@ -301,6 +335,9 @@ class ControlFlowGraph(object): # If x is both in uses and defs, it can not be optimized! 
if x in self._uses[i]: continue + if x == FLAGS_memory_optimize: + print("start match var ", x, " of op ", op.type()) + print(self.pool) for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] @@ -323,15 +360,13 @@ class ControlFlowGraph(object): if not compare_shape(x_shape, cache_shape, level): continue # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] - if x_dtype != cache_dtype: - continue - if PRINT_LOG: - print(("Hit Cache !!!! cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % (index, x, cache_var, - str(cache_shape))) + print( + ("!!! %d, %s => %s, cache idx %d, pool size %d" + % (counter, x + str(x_shape), + cache_var + str(cache_shape), index, + len(self.pool)))) + counter += 1 self.pool.pop(index) # Rename the var to the cache var already with # memory allocated in order to reuse the memory. -- GitLab
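With this patch applied, both new optimizations are switched on through
BuildStrategy from Python. A minimal sketch of how to try them (the tiny
network, the CPU_NUM value, and the variable names below are illustrative and
not part of the patch):

    import os
    import numpy as np
    import paddle.fluid as fluid

    os.environ['CPU_NUM'] = '2'  # ParallelExecutor on CPU needs CPU_NUM set

    x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    fluid.Executor(fluid.CPUPlace()).run(fluid.default_startup_program())

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = True        # IR-level memory reuse pass
    build_strategy.memory_early_delete = True    # free variables right after last use

    pe = fluid.ParallelExecutor(use_cuda=False,
                                loss_name=loss.name,
                                build_strategy=build_strategy)
    feed = {'x': np.random.random([4, 32]).astype('float32'),
            'y': np.random.random([4, 1]).astype('float32')}
    loss_val, = pe.run(feed=feed, fetch_list=[loss.name])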