diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index e88084424baf7eb5cd1d67ea19966866d71ec3eb..5e8ffa4f51d0f73d3d04d4287567779a12522b8c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +if(WITH_GPU) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) +else() +nv_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) +endif() + cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index 6345ba335997ec42ebc63f90e9bf6a3ed2648edc..ef2b4131bf94eac0f11e136d1daad461b7a94962 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -13,13 +13,19 @@ // limitations under the License. #include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include #include -#include +#include #include #include #include #include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif // PADDLE_WITH_CUDA namespace paddle { namespace framework { @@ -230,6 +236,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { return found_node; } +ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + auto it = + std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { + if (v.front() == prev) + return true; + else + return false; + }); + PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); + for (it = std::next(it); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + bool OrderedSet::Has(ir::Node* var) const { if (mark_table_.count(var->Name())) { auto& node_in_samename = mark_table_.at(var->Name()); @@ -274,14 +301,35 @@ bool NodeCanReused(ir::Node* node) { return flag; } +int MinChunkSize() { + int size{0}; +#ifdef PADDLE_WITH_CUDA + size = platform::GpuMinChunkSize(); +#else + size = platform::CpuMinChunkSize(); +#endif // PADDLE_WITH_CUDA + return size; +} + bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); + // only these types holds bulk of gpu memory if (!(type == proto::VarType::LOD_TENSOR || type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY)) { return false; } - if (node.Persistable() || node.GetShape().empty()) { + // persistable variable is parameter + if (node.Persistable()) { + return false; + } + // shape < min_chunk_size is meaningless. + // further more, fetched loss always has size = 1 + // which should not be reused. + auto shape = node.GetShape(); + int size = std::abs( + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); + if (shape.empty() || size < MinChunkSize()) { return false; } // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 0bfaf827fea84030de48a9984197f5b39f5c9261..e17030b2ab9bb6dd01bbd789b4e06719318214d7 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -62,6 +62,7 @@ class OrderedSet { } // find the bestfit shape node block with var. ir::Node* FindBestFitNode(ir::Node* var) const; + ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; // map store non-const iterator, can not promise const int GetNodeIndexInPool(ir::Node* var); // pool all node to string diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 1574d7844088f2bc73d68d497ffa3dac05ee9fef..2f9e2e662b1ef954b33940ca580faca9bf966ef2 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -76,6 +76,13 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { ir::Node* cache = pool_.FindBestFitNode(var); + while (cache != nullptr && var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." << var->Name() + << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + cache = pool_.FindNextBestFitNode(var, cache); + } if (var->Name() == FLAGS_memory_optimize_debug) { VLOG(3) << "start match var " << DebugString(var) << " of op " << op->Name(); @@ -85,14 +92,6 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } if (cache != nullptr) { - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." - << var->Name() << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - continue; - } - int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); VLOG(3) << string::Sprintf( "!!! %s, %s => %s, cache idx %d, pool size %d",