diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b9491c953f8c9b69930823a81177c1aebe5e68f8..ad19d729ebde4a9c81c283518f3cb2ac28152443 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -174,7 +174,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector) +target_link_libraries(executor garbage_collector while_op_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index dc308fd2592bb158f46f6eac9dd0df25787559fe..9f06455ea5410bcab081ed212a34960f8fe6f0bf 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1e3dbb1e44ecb16872e3bf4dee31e31cc69c9818..e98b16e6b3a07bfa0994295306e3bfa9e4174834 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase { ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); + OperatorBase *GetOp() { return op_.get(); } + std::string Name() const override; const Scope *GetScope() const { return scope_; } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 03fbfd7f24a8a987db72f45be777acc7ece577a6..dbc90737f2286db6e74d3271f39d004c25e4a949 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" @@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } } #endif + PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty"); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { @@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - // Var not found, not reference count has not decreased to 0 + // Reference count has not decreased to 0 if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } + if (!exec_scope) { + exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + // Var not found auto *var = exec_scope->FindVar(name); if (var == nullptr) { continue; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 4e42d0b4972d567dd769cad6ff8b9d45380ab77a..377bb915e0ce175d4e3fb74cb1ace21e5f46d9d8 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -12,20 +12,173 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" -#include "paddle/fluid/framework/details/eager_deletion_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); + namespace paddle { namespace framework { namespace details { +// op -> variables which can be deleted after op runs +using OpToVarNameSetMap = + std::unordered_map>; + +// Check whether the variable is LoDTensor based on static VarDesc info +static bool IsLoDTensor(VarDesc *var) { + return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; +} + +// Get memory size of LoDTensor +static int64_t GetMemorySize( + const std::unordered_map> &vars, + const std::string &var_name) { + auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); + PADDLE_ENFORCE_NOT_NULL(var_desc); + PADDLE_ENFORCE(IsLoDTensor(var_desc)); + auto dims = var_desc->GetShape(); + return SizeOfType(var_desc->GetDataType()) * + std::accumulate(dims.begin(), dims.end(), static_cast(1), + std::multiplies()); +} + +// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g. +// SelectedRows, LoDTensorArray) +// Since partial GC is based on static analysis of memory size of each variable +// So we should skip SelectedRows and LoDTensorArray here +static void SplitIntoLoDTensorAndNonLoDTensorVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { + lod_tensors->clear(); + other_vars->clear(); + + for (auto &op_vars_pair : m) { + for (auto &var_name : op_vars_pair.second) { + auto *var_desc = TryGetLatestVarDesc( + vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); + if (IsLoDTensor(var_desc)) { + (*lod_tensors)[op_vars_pair.first].insert(var_name); + } else { + (*other_vars)[op_vars_pair.first].insert(var_name); + } + } + } +} + +struct GCVarInfo { + GCVarInfo(const std::string &name, int64_t memory_size, + ComputationOpHandle *op, size_t scope_idx) + : name_(name), + memory_size_(memory_size), + op_(op), + scope_idx_(scope_idx) {} + + std::string name_; // variable name + int64_t memory_size_; // memory size + ComputationOpHandle *op_; // op after which the variable could be deleted + size_t scope_idx_; // scope index where the variable locates + + int64_t AbsMemorySize() const { return std::abs(memory_size_); } +}; + +// Delete delete_lod_tensor_only is not used currently +static OpToVarNameSetMap ShrinkGCVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + const std::vector &places, double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc when fraction_of_memory_size = 0 + if (fraction_of_memory_size <= 0.0) return {}; + + /** + * Step 1: Split all variables into LoDTensor and Non-LoDTensor. + * We can only calculate memory size of LoDTensors + */ + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + // Perform complete gc when fraction_of_memory_size >= 1 + if (fraction_of_memory_size >= 1.0) { + return delete_lod_tensor_only ? lod_tensors : m; + } + + /** + * Step 2: build GCVarInfos, and calculate total memory sizes of each device + */ + + // place -> variable info (name, memory size, place, scope_idx) + std::map> place_to_vars; + + // place -> total memory sizes + std::map place_to_size; + for (auto &op_vars_pair : lod_tensors) { + auto *op = op_vars_pair.first; + auto &var_names = op_vars_pair.second; + auto scope_idx = op->GetScopeIdx(); + auto &place = places[scope_idx]; + + for (auto &var_name : var_names) { + auto var_size = GetMemorySize(vars[scope_idx], var_name); + GCVarInfo var_info(var_name, var_size, op, scope_idx); + place_to_size[place] += var_info.AbsMemorySize(); + place_to_vars[place].emplace_back(std::move(var_info)); + } + } + + /** + * Step 3: sort GCVarInfos, and only delete the largest variables. + */ + OpToVarNameSetMap partial_vars; + for (auto &place_to_var_pair : place_to_vars) { + auto &place = place_to_var_pair.first; + auto &gc_vars = place_to_var_pair.second; + std::sort(gc_vars.begin(), gc_vars.end(), + [](const GCVarInfo &var1, const GCVarInfo &var2) { + return var1.AbsMemorySize() > var2.AbsMemorySize(); + }); + + int64_t accumulated_size = 0; + int64_t size_threshold = + static_cast(fraction_of_memory_size * place_to_size[place]); + for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold; + ++i) { + partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_); + accumulated_size += gc_vars[i].AbsMemorySize(); + } + } + + /** + * Step 4: Combine other vars (SelectedRows, LoDTensorArray) + */ + if (!delete_lod_tensor_only) { + for (auto &op_vars_pair : other_vars) { + partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(), + op_vars_pair.second.end()); + } + } + + return partial_vars; +} + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -43,9 +196,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( // a reverse map of last_live_ops // i.e., last op --> variable names which can be deleted. - std::unordered_map> - op_vars_map; - + OpToVarNameSetMap op_vars_map; for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; @@ -55,6 +206,9 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, + FLAGS_memory_fraction_of_eager_deletion); + for (auto &pair : op_vars_map) { auto *op = pair.first; auto &var_names = pair.second; @@ -85,8 +239,13 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " + << FLAGS_memory_fraction_of_eager_deletion; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; - return graph; + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + return while_op_eager_deletion_pass->Apply(std::move(graph)); } } // namespace details @@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass, .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h deleted file mode 100644 index d7a7a9709d970841060778806451bc21cb2c7571..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/eager_deletion_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class EagerDeletionPass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 13a042d8e6ed7f18c76387b666d681df0eabd0b5..6092143449bc8e20117e7021bd44553cf64ae5b5 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } -static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { - VarDesc *var_desc = nullptr; - std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); - return var_desc; -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc index 89bd08c2d041d795205b29bb29aba311d1dbd932..94de0e6ab0a91d90a7f2c4c4fc14eb78663c95fe 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -13,9 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { -namespace details {} // namespace details +namespace details { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 1c083dbf001b08e40a54cc89b21c3dea1f18f16a..ce700119c54ddd711315dfa45d61b9241cfda651 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,10 @@ namespace paddle { namespace framework { + +class VarDesc; +class VarHandle; + namespace details { class ComputationOpHandle; @@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = - std::unordered_map>; + std::unordered_map>; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd6b6dd2274d9721b8754e16cd7b4f1ab596380d --- /dev/null +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + + // Find all while_op and while_grad_op + std::unordered_map, + std::vector>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::details::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c31d0beec306fe165164837cd15c95b4efd76af0..f3869ceb6d355914107052ae046559195cc969cb 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include +#include +#include +#include +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -23,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -75,11 +80,11 @@ static std::unordered_map GetNonPersistableReferenceCounts( ExecutorPrepareContext::ExecutorPrepareContext( const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars) - : prog_(prog), block_id_(block_id) { - if (GetEagerDeletionThreshold() >= 0) { - global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + const std::vector& keep_vars, bool force_disable_gc) + : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) { + if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) { + global_ref_cnts_ = + GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars); } } @@ -184,13 +189,15 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + const std::vector& skip_ref_cnt_vars, + bool force_disable_gc) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); #ifdef PADDLE_WITH_NGRAPH if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); #endif - auto ctx = Prepare(pdesc, block_id); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -357,9 +364,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars) { - std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + std::unique_ptr ctx(new ExecutorPrepareContext( + program, block_id, skip_ref_cnt_vars, force_disable_gc)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -370,7 +377,8 @@ std::unique_ptr Executor::Prepare( std::vector> Executor::Prepare( const ProgramDesc& program, const std::vector& block_ids, - const std::vector>& skip_ref_cnt_vars) { + const std::vector>& skip_ref_cnt_vars, + bool force_disable_gc) { PADDLE_ENFORCE( skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), "skip_ref_cnt_vars should be either empty or equals to block number %d", @@ -380,9 +388,11 @@ std::vector> Executor::Prepare( for (auto& bid : block_ids) { ExecutorPrepareContext* ctx; if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid); + ctx = new ExecutorPrepareContext(program, bid, std::vector(), + force_disable_gc); } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx], + force_disable_gc); } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); @@ -409,8 +419,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - // skip while_op and while_grad_op temporarily - if (max_memory_size >= 0 && !keep_kids) { + // FIXME(zjl): recurrent_op is rather complex, we would + // disable gc forcely in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -428,6 +439,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + } } for (auto& op : ctx->ops_) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 5a040ac641588ad4d89d1f6e4c0d6c296eff38eb..65cb9e51ab2c9208b6bfbbed54f4136ffbd627ff 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" @@ -30,7 +32,8 @@ namespace framework { struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); ~ExecutorPrepareContext(); @@ -38,6 +41,7 @@ struct ExecutorPrepareContext { const framework::ProgramDesc& prog_; size_t block_id_; + bool force_disable_gc_; std::vector> ops_; std::unordered_map global_ref_cnts_; @@ -66,7 +70,10 @@ class Executor { * Scope */ void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, @@ -79,12 +86,14 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids, const std::vector>& skip_ref_cnt_vars = - std::vector>()); + std::vector>(), + bool force_disable_gc = false); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b614e9b03502634a29333f331e25201a0f77ba38..7aa1c44eaafe53034b19ee52c59cc94d3a1269da 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) register_operators(DEPS naive_executor) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 0360cf5273591946570cac47e2578e43f498b550..8352ba4f2b846af58d2d041ebf5201ee15f8481c 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -26,14 +27,6 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - namespace { // NOLINT static std::string GetSkipEagerDeletionVarsDebugString( const std::vector &vars) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cbd94a061b5b369d67b6e0995d6b8fd45801828 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,291 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase +// So that API would be the same. +class OpVariant { + struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +static std::string GetDebugString(const std::vector &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. +static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..456ba8642b9bd32a1236d112cc8b387ae6a279d3 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 88c968a0eaae8a2ac6f14ede9348c837bcd92d76..2898a62ddbac524ceb212cac5f34aeda3b1e01cb 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -282,7 +282,9 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -398,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cf59ff6d3b97a4be5d87f1185acc6173b5d501b2..439d9aa83ffb13489e254b3f9be8d7fb2976c2f9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -876,9 +876,11 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init()) .def("close", &Executor::Close) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars) { + int block_id, bool create_local_scope, bool create_vars, + const std::vector &fetch_vars) { pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars); + self.Run(prog, scope, block_id, create_local_scope, create_vars, + fetch_vars); }); m.def("init_gflags", framework::InitGflags); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8102732c55be2e9922875a7f7b29d68aba1f4900..103c4d3dd067abfc5e7ab75c92aa124f39b1972e 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -128,11 +128,11 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', - 'fast_eager_deletion_mode', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir', - 'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism', - 'enable_parallel_graph', 'multiple_of_cupti_buffer_size', - 'enable_subgraph_optimize' + 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', + 'inner_op_parallelism', 'enable_parallel_graph', + 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index dfa50e721c979703165649dccfd6e42ef08e97b7..cc3c0dd6899b4f190d0ad442940c7f90fd9118aa 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -590,7 +590,7 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - exe.run(program.desc, scope, 0, True, True) + exe.run(program.desc, scope, 0, True, True, fetch_var_name) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 603c8e74885d2a050e6e1e3101dce880b6eabe9c..05cc41b96f1992718c21eb5d7d2605dd8d3b2218 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,8 +16,7 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -os.environ[ - 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio' from test_parallel_executor_transformer import TestTransformer diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py new file mode 100644 index 0000000000000000000000000000000000000000..898d04ebe1c9c2c3a336aeca07ab6ce79a890e0a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +os.environ['CPU_NUM'] = '2' +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['FLAGS_fast_eager_deletion_mode'] = '1' + +import unittest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core +from paddle.fluid.backward import append_backward +import paddle.fluid.compiler as compiler +import numpy +import multiprocessing + + +class TestEagerDeletionWhileOpBase(unittest.TestCase): + def test_main(self): + places = [core.CPUPlace(), ] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for p in places: + for with_data_parallel in [False, True]: + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + self.run_main(p, with_data_parallel) + + def run_main(self, place, with_data_parallel): + self.place = place + self.with_data_parallel = with_data_parallel + + if not core.is_compiled_with_cuda() and isinstance(self.place, + core.CUDAPlace): + return + + if isinstance(self.place, core.CUDAPlace): + device_cnt = core.get_cuda_device_count( + ) if self.with_data_parallel else 1 + else: + device_cnt = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count( + ))) if self.with_data_parallel else 1 + + d0 = layers.data( + "d0", shape=[10], append_batch_size=False, dtype='float32') + d1 = layers.data( + "d1", shape=[10], append_batch_size=False, dtype='float32') + d2 = layers.data( + "d2", shape=[10], append_batch_size=False, dtype='float32') + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + init = layers.zeros(shape=[10], dtype='float32') + mem_array = layers.array_write(x=init, i=i) + data_array = layers.array_write(x=d0, i=i) + + i = layers.increment(i) + layers.array_write(d1, i, array=data_array) + + i = layers.increment(i) + layers.array_write(d2, i, array=data_array) + + i = layers.zeros(shape=[1], dtype='int64') + i.stop_gradient = True + + array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) + array_len.stop_gradient = True + cond = layers.less_than(x=i, y=array_len) + + j = layers.fill_constant(shape=[1], dtype='int64', value=1) + j.stop_gradient = True + + array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len2.stop_gradient = True + cond2 = layers.less_than(x=j, y=array_len2) + + while_op = layers.While(cond=cond) + while_op2 = layers.While(cond=cond2) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + prev = layers.array_read(array=mem_array, i=i) + d = layers.reshape(d, shape=[10]) + prev = layers.reshape(prev, shape=[10]) + result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=mem_array) + layers.less_than(x=i, y=array_len, cond=cond) + with while_op2.block(): + d2 = layers.array_read(array=data_array, i=j) + prev2 = layers.array_read(array=mem_array, i=j) + d2 = layers.reshape(d2, shape=[10]) + prev2 = layers.reshape(prev2, shape=[10]) + result2 = layers.sums(input=[d2, prev2]) + + j = layers.increment(x=j, in_place=True) + layers.array_write(result2, i=j, array=mem_array) + layers.less_than(x=j, y=array_len2, cond=cond2) + + sum_result = layers.array_read(array=mem_array, i=j) + sum_result.persistable = True + tmp = layers.unsqueeze(sum_result, axes=[0]) + tmp = layers.expand(tmp, expand_times=[10, 1]) + fc = layers.fc(tmp, size=256) + loss = layers.mean(sum_result) + + optim = fluid.optimizer.Adam(learning_rate=1e-3) + optim.minimize(loss) + + exe = Executor(self.place) + exe.run(fluid.default_startup_program()) + + prog = compiler.CompiledProgram(fluid.default_main_program()) + if self.with_data_parallel: + prog = prog.with_data_parallel() + + for _ in range(5): + d = [] + for i in range(3): + tmp = numpy.random.random(size=[10]).astype('float32') + if not self.with_data_parallel: + d.append(tmp) + else: + d.append(numpy.array([tmp] * device_cnt)) + + outs = exe.run(program=prog, + feed={'d0': d[0], + 'd1': d[1], + 'd2': d[2]}, + fetch_list=[sum_result]) + self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7607189454b2264523176b6853fd9debddf47eed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" + +os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio' + +from test_parallel_executor_transformer import TestTransformer + +if __name__ == '__main__': + unittest.main()