diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ad19d729ebde4a9c81c283518f3cb2ac28152443..4d54754cec00dc435000138d4f297af243813fc3 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) -cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,6 +164,8 @@ else() set(NGRAPH_EXE_DEPS) endif() +cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) @@ -174,7 +176,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector while_op_helper) +target_link_libraries(executor while_op_helper executor_gc_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor @@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_con cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) + cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 377bb915e0ce175d4e3fb74cb1ace21e5f46d9d8..a6baa26134cf36ea93dde554f808e73fa0c30b93 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -22,14 +22,9 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/ir/graph_helper.h" -DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion. If less than 1.0, all variables in " - "the program would be sorted according to its memory size, and " - "only the FLAGS_memory_fraction_of_eager_deletion of the largest " - "variables would be deleted."); - namespace paddle { namespace framework { namespace details { @@ -206,8 +201,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( } } - op_vars_map = ShrinkGCVars(op_vars_map, vars, places, - FLAGS_memory_fraction_of_eager_deletion); + double memory_fraction = framework::GetEagerDeletionMemoryFraction(); + + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction); for (auto &pair : op_vars_map) { auto *op = pair.first; @@ -239,8 +235,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } - VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " - << FLAGS_memory_fraction_of_eager_deletion; + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; auto while_op_eager_deletion_pass = diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h deleted file mode 100644 index c8382d34b790ba7c95415acdf0b55dc97a9cd265..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/early_delete_op_handle.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include <string> -#include <vector> -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -class EarlyDeleteOpHandle : public OpHandleBase { - public: - EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, - const platform::Place& place, - const std::vector<std::string>& names, - GarbageCollector* gc) - : OpHandleBase(node), - scope_(scope), - place_(place), - names_(names), - gc_(gc) { -#ifdef PADDLE_WITH_CUDA - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get<platform::CUDAPlace>(place); - PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } -#endif - } - ~EarlyDeleteOpHandle() { -#ifdef PADDLE_WITH_CUDA - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } -#endif - } - - std::string Name() const override { return "early_delete"; } - - protected: - void RunImpl() override { - std::vector<std::shared_ptr<memory::Allocation>> tensors; - auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>(); - for (auto& var_name : names_) { - auto* var = local_scope->FindVar(var_name); - PADDLE_ENFORCE(var != nullptr, - string::Sprintf("Local Scope not has var %s", var_name)); - if (var->IsType<LoDTensor>()) { - tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder()); - } else if (var->IsType<SelectedRows>()) { - tensors.emplace_back(var->GetMutable<SelectedRows>() - ->mutable_value() - ->MoveMemoryHolder()); - } else if (var->IsType<LoDTensorArray>()) { - LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>(); - for (auto& tensor : *tensor_array) { - tensors.emplace_back(tensor.MoveMemoryHolder()); - } - } - } - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors( - const std::vector<std::shared_ptr<memory::Allocation>>& tensors) { - if (platform::is_cpu_place(place_)) { - ClearCPUTensors(tensors); - } else { - ClearGPUTensors(tensors); - } - } - - void ClearCPUTensors( - const std::vector<std::shared_ptr<memory::Allocation>>& tensors) { - auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_); - if (gc != nullptr) { - gc->Add(tensors); - } - } - - void ClearGPUTensors( - const std::vector<std::shared_ptr<memory::Allocation>>& tensors) { -#ifdef PADDLE_WITH_CUDA - auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr; -#endif - } - - const Scope* scope_; - const platform::Place place_; - std::vector<std::string> names_; - GarbageCollector* gc_; -#ifdef PADDLE_WITH_CUDA - platform::CUDADeviceContext* dev_ctx_; - cudaEvent_t event_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 8981ec803c95c14cacd897b1ef8a419c3030cd3b..e5b58ec68761469a03929435d1a73bf0a2d1660e 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -21,6 +21,7 @@ limitations under the License. */ #include <vector> #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -36,27 +37,86 @@ enum OpInfoFillType { kGradOpDescMaker = 2, kVarTypeInference = 3, kShapeInference = 4, - kInplaceOpInference = 5 + kInplaceOpInference = 5, + kNoNeedBufferVarsInference = 6, + kUnknown = -1 }; +namespace internal { +template <typename T, OpInfoFillType kType> +struct TypePair { + using Type = T; + static constexpr OpInfoFillType kFillType = kType; +}; + +using OpRegistryClasses = std::tuple< // NOLINT + TypePair<OperatorBase, kOperator>, // NOLINT + TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>, // NOLINT + TypePair<GradOpDescMakerBase, kGradOpDescMaker>, // NOLINT + TypePair<VarTypeInference, kVarTypeInference>, // NOLINT + TypePair<InferShapeBase, kShapeInference>, // NOLINT + TypePair<InplaceOpInference, kInplaceOpInference>, // NOLINT + TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference> // NOLINT + >; + +static constexpr int kOpRegistryClassNumber = + std::tuple_size<OpRegistryClasses>::value; + +template <typename T, int kPos, bool kIsBounded /* = true*/> +struct IsMatchedBaseTypeImpl { + using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type; + static constexpr bool kValue = + std::is_base_of<typename PairType::Type, T>::value; +}; + +template <typename T, int kPos> +struct IsMatchedBaseTypeImpl<T, kPos, false> { + static constexpr bool kValue = false; +}; + +template <typename T, int kPos> +static inline constexpr bool IsMatchedBaseType() { + return IsMatchedBaseTypeImpl< + T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue; +} + +template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched> +struct OpInfoFillTypeGetterImpl {}; + +// This case should not happen +template <typename T, int kStart, int kEnd> +struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {}; + +template <typename T, int kStart, int kEnd> +struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> { + static constexpr OpInfoFillType kType = kUnknown; +}; + +template <typename T, int kStart, int kEnd> +struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> { + static constexpr OpInfoFillType kType = + OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd, + IsMatchedBaseType<T, kStart + 1>()>::kType; +}; + +template <typename T, int kStart, int kEnd> +struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> { + using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type; + static constexpr OpInfoFillType kType = PairType::kFillType; +}; + +template <typename T> +using OpInfoFillTypeGetter = + OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber, + kOpRegistryClassNumber == 0, + IsMatchedBaseType<T, 0>()>; + +} // namespace internal + template <typename T> struct OpInfoFillTypeID { static constexpr OpInfoFillType ID() { - return std::is_base_of<OperatorBase, T>::value - ? kOperator - : (std::is_base_of<OpProtoAndCheckerMaker, T>::value - ? kOpProtoAndCheckerMaker - : (std::is_base_of<GradOpDescMakerBase, T>::value - ? kGradOpDescMaker - : (std::is_base_of<VarTypeInference, T>::value - ? kVarTypeInference - : (std::is_base_of<InferShapeBase, T>::value - ? kShapeInference - : (std::is_base_of< - InplaceOpInference, T>::value - ? kInplaceOpInference - : static_cast<OpInfoFillType>( - -1)))))); + return internal::OpInfoFillTypeGetter<T>::kType; } }; @@ -156,6 +216,18 @@ struct OpInfoFiller<T, kInplaceOpInference> { } }; +template <typename T> +struct OpInfoFiller<T, kNoNeedBufferVarsInference> { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + T infer(inputs, outputs, attrs); + return infer(); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 6092143449bc8e20117e7021bd44553cf64ae5b5..0c3d8d5caec0015c5696223db2e4b75a8d79e5e1 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -193,6 +193,79 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } +/** + * Shrink op dependencies according to no need buffer vars. + * + * If some ops do not need Tensor buffer of any input, + * just remove the dependency of this op, i.e, decrease reference count. + * + * For example, input Y of elementwise_add_grad op is only used to infer shape + * and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of + * input Y can be collected before elementwise_add_grad op runs. + * + * This method returns whether the dependency count decreases to 0, and + * shrinks op dependency if possible. + */ +static bool ShrinkNoNeedBufferVarOpDependency( + const std::string &var_name, + std::unordered_set<ComputationOpHandle *> *op_handles) { + std::vector<ComputationOpHandle *> skip_ops; + for (auto *op_handle : *op_handles) { + auto *op_base = op_handle->GetOp(); + auto &inferer = op_base->Info().NoNeedBufferVarsInferer(); + if (!inferer) { + continue; + } + + std::unordered_set<std::string> no_need_buffer_vars = + inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs()); + + // Check whether var_name occurs in other inputs or outputs of the op + // If it occurs, we cannot decrease the dependency number. + bool occurred_in_other_vars = false; + for (auto &in_pair : op_base->Inputs()) { + if (no_need_buffer_vars.count(in_pair.first) > 0) { + continue; + } + + auto &args = in_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (occurred_in_other_vars) { + continue; + } + + for (auto &out_pair : op_base->Outputs()) { + auto &args = out_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (!occurred_in_other_vars) { + VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name(); + skip_ops.emplace_back(op_handle); + } + } + + if (skip_ops.size() == op_handles->size()) { + op_handles->clear(); + return true; + } else { + for (auto *skip_op : skip_ops) { + op_handles->erase(skip_op); + } + return false; + } +} + std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( std::unique_ptr<ir::Graph> graph) const { auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount); @@ -229,17 +302,43 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( continue; } - bool ok; - auto result = ExtractComputationOpFromLastLivedVar( - name_var_pair.second.back(), i, shrink_func, &ok); + auto &var_name = name_var_pair.first; + auto &var_handles = name_var_pair.second; + + for (auto iter = var_handles.rbegin(); iter != var_handles.rend(); + ++iter) { + bool ok; + auto result = + ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok); + + // Seldomly, some vars may have no pending or preceding computation ops + // Just break; + if (!ok) break; + VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; + + size_t original_op_deps = result.size(); + // If all ops do not need buffer of var_name, calculate reference count + // of the previous version of var_name. + if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) { + VLOG(10) << "Try to precede reference count computing at var " + << var_name; + continue; + } + + size_t final_op_deps = result.size(); + if (final_op_deps < original_op_deps) { + VLOG(5) << "Shrink op deps from " << original_op_deps << " to " + << final_op_deps; + } - if (ok) { - auto &var_name = name_var_pair.first; PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", var_name); ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } + + // Seldomly, all preceding trying failed. + // Just skip this corner case } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 99192292b0be992d5ff0ecebba6294b9ba27e958..0d4334f193dcb067a49f5e67b69d21531c7048bd 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include <unordered_set> #include <utility> +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -48,97 +49,23 @@ namespace { int kProgramId = -1; } // namespace -static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts( - const BlockDesc& block, const std::vector<std::string>& skip_var_list) { - std::unordered_map<std::string, size_t> ref_cnts; - std::unordered_set<std::string> skip_vars(skip_var_list.begin(), - skip_var_list.end()); - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - if (skip_vars.count(name)) continue; - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS && - type != proto::VarType::LOD_TENSOR_ARRAY) { - continue; - } - ++ref_cnts[name]; - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id, - const std::vector<std::string>& keep_vars, bool force_disable_gc) - : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) { - if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) { - global_ref_cnts_ = - GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars); + const framework::ProgramDesc& prog, size_t block_id) + : prog_(prog), block_id_(block_id) {} + +void ExecutorPrepareContext::PrepareUnusedVars( + const std::vector<std::string>& keep_vars, bool force_disable_gc) { + force_disable_gc_ = force_disable_gc; + if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) { + return; } + unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars); } ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -static void DeleteUnusedTensors( - const Scope& scope, const OperatorBase* op, GarbageCollector* gc, - std::unordered_map<std::string, size_t>* ref_cnts) { - std::deque<std::shared_ptr<memory::Allocation>> garbages; - - auto handler = [&](const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto it = ref_cnts->find(name); - if (it == ref_cnts->end()) continue; - if (--(it->second) != 0) { - continue; - } - auto* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - - VLOG(2) << "Erase variable " << name; - if (var->IsType<LoDTensor>()) { - garbages.emplace_back( - var->GetMutable<LoDTensor>()->MoveMemoryHolder()); - } else if (var->IsType<SelectedRows>()) { - garbages.emplace_back(var->GetMutable<SelectedRows>() - ->mutable_value() - ->MoveMemoryHolder()); - } else if (var->IsType<LoDTensorArray>()) { - auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>(); - for (auto& t : *lod_tensor_arr) { - garbages.emplace_back(t.MoveMemoryHolder()); - } - } else { - PADDLE_THROW("Type %s of %s is not supported eager deletion", - framework::ToTypeName(var->Type()), name); - } - } - } - }; - - handler(op->Inputs()); - handler(op->Outputs()); - - if (!garbages.empty()) { - gc->Add(std::move(garbages)); - } -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -362,8 +289,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( const ProgramDesc& program, int block_id, const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) { - std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext( - program, block_id, skip_ref_cnt_vars, force_disable_gc)); + std::unique_ptr<ExecutorPrepareContext> ctx( + new ExecutorPrepareContext(program, block_id)); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -375,6 +302,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( ctx->prog_.Block(ctx->block_id_), &ctx->ops_); } #endif + ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc); return ctx; } @@ -389,19 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( std::vector<std::shared_ptr<ExecutorPrepareContext>> result; size_t idx = 0; for (auto& bid : block_ids) { - ExecutorPrepareContext* ctx; - if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(), - force_disable_gc); - } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx], - force_disable_gc); - } PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size()); + auto* ctx = new ExecutorPrepareContext(program, bid); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } + if (skip_ref_cnt_vars.empty()) { + ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc); + } else { + ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc); + } result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx)); ++idx; } @@ -425,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, // FIXME(zjl): recurrent_op is rather complex, we would // disable gc forcely in recurrent_op if (!ctx->force_disable_gc_ && max_memory_size >= 0) { - ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { @@ -453,8 +378,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc) { - DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->runtime_ref_cnts_)); + DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 65cb9e51ab2c9208b6bfbbed54f4136ffbd627ff..825224437e0cdda03c56faf1b50833abd8b8c2ab 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -30,22 +30,20 @@ namespace paddle { namespace framework { struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, - const std::vector<std::string>& skip_ref_cnt_vars = - std::vector<std::string>(), - bool force_disable_gc = false); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); ~ExecutorPrepareContext(); - void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } + void PrepareUnusedVars(const std::vector<std::string>& keep_vars, + bool force_disable_gc = false); const framework::ProgramDesc& prog_; - size_t block_id_; - bool force_disable_gc_; + const size_t block_id_; + std::vector<std::unique_ptr<OperatorBase>> ops_; - std::unordered_map<std::string, size_t> global_ref_cnts_; - std::unordered_map<std::string, size_t> runtime_ref_cnts_; + std::unordered_map<OperatorBase*, std::vector<std::string>> unused_vars_; + bool force_disable_gc_{false}; }; class Executor { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..77b0977b5a47fdf4413e75c4e89cf638949e937f --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include <deque> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> +#include "glog/logging.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +struct OpInOutInfo { + public: + void Build(const OperatorBase *op) { + is_built_ = true; + auto &inferer = op->Info().NoNeedBufferVarsInferer(); + if (inferer) { + no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs()); + + if (no_need_buffer_ins_.empty()) return; + + for (auto &in_name_pair : op->Inputs()) { + if (no_need_buffer_ins_.count(in_name_pair.first) != 0) { + continue; + } + + for (auto &in_arg_name : in_name_pair.second) { + other_args_set_.insert(in_arg_name); + } + } + + for (auto &out_name_pair : op->Outputs()) { + for (auto &out_arg_name : out_name_pair.second) { + other_args_set_.insert(out_arg_name); + } + } + } + } + + bool IsBuilt() const { return is_built_; } + + bool IsInArgBufferNeeded(const std::string &in_arg_name) const { + return no_need_buffer_ins_.empty() || + other_args_set_.count(in_arg_name) != 0; + } + + private: + // A set to record unused buffer input vars of op + std::unordered_set<std::string> no_need_buffer_ins_; + // A set to record other args of op (including in, out) + std::unordered_set<std::string> other_args_set_; + bool is_built_{false}; +}; + +static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, + const std::unordered_set<std::string> &skip_vars) { + if (skip_vars.count(name) != 0) { + return false; + } + + auto *var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) { + return false; + } + + auto type = var_desc->Proto()->type().type(); + + return type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY; +} + +std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars( + const BlockDesc &block, + const std::vector<std::unique_ptr<OperatorBase>> &ops, + const std::vector<std::string> &skip_var_list) { + std::unordered_set<std::string> skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + std::unordered_map<std::string, size_t> var_op_idx_map; + + for (size_t i = 0; i < ops.size(); ++i) { + auto *op = ops[i].get(); + + OpInOutInfo info; + for (auto &name_pair : op->Inputs()) { + for (auto &name : name_pair.second) { + if (!VarCanBeDeleted(name, block, skip_vars)) { + continue; + } + + // var can be gc-ed + if (!info.IsBuilt()) { + info.Build(op); + } + + if (info.IsInArgBufferNeeded(name)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } else { + VLOG(10) << "Skip reference count computing of variable " + << name_pair.first << "(" << name << ") in Operator " + << op->Type(); + } + } + } + + for (auto &name_pair : op->Outputs()) { + for (auto &name : name_pair.second) { + if (VarCanBeDeleted(name, block, skip_vars)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } + } + } + } + + std::unordered_map<OperatorBase *, std::vector<std::string>> result; + for (auto &name_op_idx_pair : var_op_idx_map) { + auto &name = name_op_idx_pair.first; + size_t op_idx = name_op_idx_pair.second; + result[ops[op_idx].get()].emplace_back(name); + } + return result; +} + +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map<OperatorBase *, std::vector<std::string>> + &delete_vars_map, + GarbageCollector *gc) { + auto iter = delete_vars_map.find(op); + if (iter == delete_vars_map.end()) { + return; + } + + auto &delete_vars = iter->second; + + std::deque<std::shared_ptr<memory::Allocation>> garbages; + + for (auto &var_name : delete_vars) { + auto *var = scope.FindVar(var_name); + if (var == nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << var_name; + if (var->IsType<LoDTensor>()) { + garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder()); + } else if (var->IsType<SelectedRows>()) { + garbages.emplace_back( + var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType<LoDTensorArray>()) { + auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>(); + for (auto &t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); + } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + framework::ToTypeName(var->Type()), var_name); + } + } + + if (!garbages.empty()) { + gc->Add(std::move(garbages)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..8553273f8242844d0203d7bcd90ea2090b65826c --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +// Result map: op -> variable names that can be deleted after op runs +std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars( + const BlockDesc &block, + const std::vector<std::unique_ptr<OperatorBase>> &ops, + const std::vector<std::string> &skip_vars); + +// Collect unused tensors after op runs +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map<OperatorBase *, std::vector<std::string>> + &delete_vars_map, + GarbageCollector *gc); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 54d9d0dc018b08decb2ff8965659bab98e81f3ab..789b2ef80ec09a69ca227a27c61dd58e58a2fc04 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,14 +13,36 @@ // limitations under the License. #include <algorithm> +#include <deque> +#include <functional> +#include <memory> +#include <mutex> // NOLINT +#include <utility> #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/fluid/framework/garbage_collector.h" namespace paddle { namespace framework { +DEFINE_double( + eager_delete_tensor_gb, -1.0, + "Memory size threshold (GB) when the garbage collector clear tensors." + "Disabled when this value is less than 0"); + +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); + +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); + GarbageCollector::GarbageCollector(const platform::Place &place, size_t max_memory_size) : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) { @@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback( callback_manager_->AddCallback(callback); } #endif + +int64_t GetEagerDeletionThreshold() { + return FLAGS_eager_delete_tensor_gb < 0 + ? -1 + : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb * + (static_cast<int64_t>(1) << 30)); +} + +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) { + FLAGS_eager_delete_tensor_gb = threshold; + FLAGS_memory_fraction_of_eager_deletion = fraction; + FLAGS_fast_eager_deletion_mode = fast_mode; +} + +double GetEagerDeletionMemoryFraction() { + return FLAGS_memory_fraction_of_eager_deletion; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2768671029c06562aa0d2e5eea3d3ff61d900ab5..f0b504627ae0cd99c8b4b15df3dcfc39a56507f2 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -18,6 +18,8 @@ #include <functional> #include <memory> #include <mutex> // NOLINT +#include <utility> +#include "gflags/gflags.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) { } } +int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode); + +double GetEagerDeletionMemoryFraction(); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..2c933659840d02e65c3b222144a31e558e8e8ae8 --- /dev/null +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <unordered_set> +#include <vector> +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace framework { + +class NoNeedBufferVarsInference { + public: + NoNeedBufferVarsInference(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs) + : inputs_(inputs), outputs_(outputs), attrs_(attrs) {} + + virtual ~NoNeedBufferVarsInference() = default; + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + const AttributeMap &Attrs() const { return attrs_; } + + virtual std::unordered_set<std::string> operator()() const = 0; + + private: + const VariableNameMap &inputs_; + const VariableNameMap &outputs_; + const AttributeMap &attrs_; +}; + +#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \ + class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \ + public: \ + using ::paddle::framework::NoNeedBufferVarsInference:: \ + NoNeedBufferVarsInference; \ + \ + std::unordered_set<std::string> operator()() const override { \ + return {__VA_ARGS__}; \ + } \ + } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 4b55bd0703eee399cd841f90ea0b18d8fbdc67e8..e200d188b3f2462657bbac086d7659b1f85e55e9 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -19,6 +19,7 @@ limitations under the License. */ #include <unordered_map> #include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/macros.h" @@ -39,6 +40,7 @@ struct OpInfo { InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; InferInplaceOpFN infer_inplace_; + InferNoNeedBufferVarsFN infer_no_need_buffer_vars_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; @@ -64,6 +66,10 @@ struct OpInfo { } const OpAttrChecker* Checker() const { return checker_; } + + const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const { + return infer_no_need_buffer_vars_; + } }; class OpInfoMap { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7e745ea41affdf6fd82c30d79bea7d8e2c23ead2..eef84d17a4b9ec54f77e60b09f3a13b151794bea 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include <algorithm> #include <sstream> #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" @@ -326,7 +327,12 @@ OperatorBase::OperatorBase(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) - : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { + : type_(type), + inputs_(inputs), + outputs_(outputs), + attrs_(attrs), + // NOTE(zjl): why op_info may be nullptr? + info_(OpInfoMap::Instance().GetNullable(type)) { GenerateTemporaryNames(); CheckAllInputOutputSet(); } @@ -350,7 +356,7 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const { } return ret_val; } - auto& info = OpInfoMap::Instance().Get(Type()); + auto& info = Info(); // get all OpProto::Var for outputs for (auto& o : info.Proto().outputs()) { @@ -366,18 +372,16 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const { } void OperatorBase::CheckAllInputOutputSet() const { - auto& info_map = OpInfoMap::Instance(); - auto* op_info = info_map.GetNullable(Type()); - if (op_info == nullptr || op_info->proto_ == nullptr) return; + if (info_ == nullptr || info_->proto_ == nullptr) return; - for (auto& in : op_info->Proto().inputs()) { + for (auto& in : info_->Proto().inputs()) { if (!in.dispensable()) { PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), "Operator %s's input, %s, is not set", Type(), in.name()); } } - for (auto& out : op_info->Proto().outputs()) { + for (auto& out : info_->Proto().outputs()) { if (!out.dispensable()) { PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), "Operator %s's output, %s, is not set", Type(), @@ -997,7 +1001,27 @@ Scope* OperatorWithKernel::PrepareData( std::vector<std::string>* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; + + std::unordered_set<std::string> no_buffer_ins; + if (info_) { + auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); + // Some op may not register NoNeedBufferVarsInferer + if (no_buffer_inferer) { + no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs()); + } + } + for (auto& var_name_item : Inputs()) { + // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set + // is empty. At least STL implemented on my mac does calculate hash code + // of search key even though the set is empty. + if (!no_buffer_ins.empty() && + no_buffer_ins.count(var_name_item.first) > 0) { + VLOG(1) << "Skip scanning input " << var_name_item.first + << " in Operator " << type_; + continue; + } + std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first]; for (size_t i = 0; i < var_name_item.second.size(); ++i) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 684960c235b334f605553c4daed8fb7653be121b..a02e53dcf764368601646a900833ac650c5bb31a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -160,6 +160,11 @@ class OperatorBase { const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Outputs() const { return outputs_; } + const OpInfo& Info() const { + PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_); + return *info_; + } + bool HasInputs(const std::string& name) const; //! Get a input with argument's name described in `op_proto` std::string Input(const std::string& name) const; @@ -194,6 +199,10 @@ class OperatorBase { // IG (Inputs Gradients) VariableNameMap outputs_; AttributeMap attrs_; + + // OpInfo + const OpInfo* info_; + // Whether this operator executes in an Executor. bool run_by_executor_{true}; @@ -444,7 +453,7 @@ class OperatorWithKernel : public OperatorBase { } virtual void InferShape(InferShapeContext* ctx) const { - OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); + Info().infer_shape_(ctx); } void RuntimeInferShape(const Scope& scope, const platform::Place& place, diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index d79bf25518bbe624f2913839ec7d7d80816b3b69..a96baaf41f3fcd24817421a7b620343558cd78d1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -29,15 +29,6 @@ DEFINE_bool( "Delete local scope eagerly. It will reduce GPU memory usage but " "slow down the destruction of variables.(around 1% performance harm)"); -DEFINE_double( - eager_delete_tensor_gb, -1.0, - "Memory size threshold (GB) when the garbage collector clear tensors." - "Disabled when this value is less than 0"); - -DEFINE_bool(fast_eager_deletion_mode, true, - "Fast eager deletion mode. If enabled, memory would release " - "immediately without waiting GPU kernel ends."); - // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. @@ -57,15 +48,6 @@ DEFINE_bool(fast_eager_deletion_mode, true, namespace paddle { namespace framework { -int64_t GetEagerDeletionThreshold() { - return FLAGS_eager_delete_tensor_gb < 0 - ? -1 - : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb * - (static_cast<int64_t>(1) << 30)); -} - -bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } - Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee072b0bcd53f37dad5ef9d801c87172..242cbae7163c48fa44dca9237f1cd35f9ec98442 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -32,9 +32,6 @@ extern "C" { namespace paddle { namespace framework { -int64_t GetEagerDeletionThreshold(); -bool IsFastEagerDeletionModeEnabled(); - class Scope; /** diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 6ea8e7203a2b453e72a8c03cea7bb146bdcf96b4..4ae6a272d5b043f25015ad8d5cfc2139d394ed5c 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -30,6 +30,7 @@ class InferShapeContext; class InferVarTypeContext; class BlockDesc; class Variable; +class NoNeedBufferVarsInference; using VariableNameMap = std::map<std::string, std::vector<std::string>>; // TODO(panyx0718): Replace vector with something like gtl::Vector. @@ -61,5 +62,9 @@ using InferShapeFN = std::function<void(InferShapeContext*)>; using InplacePair = std::unordered_map<std::string, std::string>; using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>; +using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>( + const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/, + const AttributeMap& /*attrs*/)>; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index 8127e554bed1aae7a5ce8837bcadf1b7f13f1ac2..3882bbedaa0be0ba14bca9c4fcb626d5ecaab129 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/add_position_encoding_op.h" +#include <memory> namespace paddle { namespace operators { @@ -39,13 +40,8 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Out@GRAD must not be null."); - - auto out_dims = ctx->GetInputDim("Out"); if (ctx->HasOutput(framework::GradVarName("X"))) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); ctx->SetOutputDim(framework::GradVarName("X"), out_dims); } } @@ -75,6 +71,22 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { } }; +class AddPositionEncodingGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("add_position_encoding_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -83,7 +95,7 @@ namespace plt = paddle::platform; REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp, ops::AddPositionEncodingOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::AddPositionEncodingGradOpDescMaker); REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index a679f7e2536a0a44148193f423f5ffe11b5e35fc..4fc6ae365ec61326670775ab13b854235f19266f 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/clip_op.h" +#include <memory> namespace paddle { namespace operators { @@ -76,12 +77,28 @@ class ClipOpGrad : public framework::OperatorWithKernel { } }; +class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("clip_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ClipGradOpDescMaker); REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 6e3c9f28649b9f15a2a78fc832ab5e52986fcf46..1f71555180361a1522b7a1c8383fe128bc4edcd0 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" +#include <memory> #include <string> #include <vector> @@ -120,11 +121,7 @@ Examples: class ConcatOpGrad : public framework::OperatorWithKernel { public: - ConcatOpGrad(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} + using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { auto in_x = "X"; @@ -142,6 +139,33 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference, + "X"); + +class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } }; } // namespace operators @@ -149,9 +173,9 @@ class ConcatOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, - paddle::framework::DefaultGradOpDescMaker< - false> /* set false to disable empty grad */); -REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad); + ops::ConcatGradOpDescMaker); +REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, + ops::ConcatOpGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL( concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index c6121d00dae4007f2fcaf57b0945d3f34233781d..619e12e6ba7c73e46beafadd50770aedfb52c964 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -455,13 +455,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( return type; } -class Conv2dGradMaker : public framework::SingleGradOpDescMaker { +class Conv2DGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; std::unique_ptr<framework::OpDesc> Apply() const override { auto* op = new framework::OpDesc(); - op->SetType(GradOpType()); + op->SetType(this->ForwardOpType() + "_grad"); op->SetInput("Input", Input("Input")); op->SetInput("Filter", Input("Filter")); op->SetInput("Bias", Input("Bias")); @@ -470,14 +470,33 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); - op->SetAttrMap(Attrs()); return std::unique_ptr<framework::OpDesc>(op); } +}; + +class Conv3DGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - virtual std::string GradOpType() const { - return this->ForwardOpType() + "_grad"; + std::unique_ptr<framework::OpDesc> Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + if (ForwardOp().Inputs().count("ResidualData") != 0) { + op->SetInput("ResidualData", Input("ResidualData")); + } + + op->SetAttrMap(Attrs()); + + return std::unique_ptr<framework::OpDesc>(op); } }; @@ -486,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, ops::Conv2dGradMaker); + ops::ConvOpInferVarType, ops::Conv2DGradMaker); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); // depthwise convolution op REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, ops::Conv2dGradMaker); + ops::ConvOpInferVarType, ops::Conv2DGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, - ops::ConvOpInferVarType, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ConvOpInferVarType, ops::Conv3DGradMaker); REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); // depthwise conv kernel diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 97d20681b8136c13d512c0b86a7ff15b24367db2..78fcd07e1df8d590ad2a4508bbc82477d928c6e9 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" -#include <boost/lexical_cast.hpp> +#include <memory> +#include <string> +#include <vector> namespace paddle { namespace operators { @@ -178,12 +180,31 @@ class CropOpGrad : public framework::OperatorWithKernel { } }; +class CropGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("crop_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("Offsets") > 0) { + op->SetInput("Offsets", Input("Offsets")); + } + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::CropGradOpDescMaker); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL( crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a617b9fb1d948340d25853252be79fdd08fe0438..ad32de53e7019b438b7106ddd031a8f00bd79b5d 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -238,6 +238,23 @@ class CrossEntropyGradientOp : public CrossEntropyGradientOpBase { } }; +class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("cross_entropy_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + class CrossEntropyOp2 : public CrossEntropyOpBase { public: using CrossEntropyOpBase::CrossEntropyOpBase; @@ -354,7 +371,7 @@ using CPUCtx = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase, ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::CrossEntropyGradOpDescMaker); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, ops::CrossEntropyOpKernel<CPUCtx, double>); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index e63d57be57a66e8e02f7ef88acd01246302bc53c..134f84d59cafa661fce727adc3303444c4ef483e 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include <memory> #include <string> #include "paddle/fluid/framework/op_registry.h" @@ -170,11 +171,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input(Input) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("last_h"), - "Input(last_h) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("last_c"), - "Input(last_c) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Cache"), "Input(last_c) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("InitH"), @@ -197,6 +193,35 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { } }; +class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("cudnn_lstm_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("InitH", Input("InitH")); + op->SetInput("InitC", Input("InitC")); + op->SetInput("W", Input("W")); + if (ForwardOp().Inputs().count("Cache") > 0) { + op->SetInput("Cache", Input("Cache")); + } + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c")); + op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("W"), InputGrad("W")); + op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH")); + op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC")); + op->SetAttrMap(Attrs()); + return op; + } +}; + template <typename T> class NotImpleKernel : public framework::OpKernel<T> { public: @@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel<T> { namespace ops = paddle::operators; REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::CudnnLSTMGradOpDescMaker); REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>); diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index c63d65348880ebb4085d83059d9fead6456216d7..65295c2c103ceca50d9de3ae314246256497d084 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -14,6 +14,7 @@ #include <set> #include <string> +#include <unordered_map> #include <vector> #include "paddle/fluid/operators/distributed/parameter_prefetch.h" @@ -218,7 +219,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, boost::get<platform::CUDAPlace>(id_tensor.place()), id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(), stream); - for (size_t i = 0; i < cpu_tensor.numel(); ++i) { + for (int64_t i = 0; i < cpu_tensor.numel(); ++i) { ids_vector.push_back(cpu_tensor_data[i]); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index c6c658236c235f0a6767924026b0a7610071e918..2b3fc06dcb79b8c6b46de7abf51bdb2c47acca1c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); -REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", - "X"); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y"); REGISTER_OP_CPU_KERNEL( elementwise_add, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index eba8fbc1db63cc2b0761455e07856d80760c805a..6dbb9072495f743a4df1ff05e029a227c2cf618b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -272,12 +272,11 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y"); + } // namespace operators } // namespace paddle -/* -*/ - #define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ class kernel_type##GradMaker \ : public paddle::framework::SingleGradOpDescMaker { \ @@ -311,18 +310,19 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference { ::paddle::framework::DefaultGradOpDescMaker<true>); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) -#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \ - class __ElemwiseOp##op_type##Maker__ \ - : public ::paddle::operators::ElementwiseOpMaker { \ - protected: \ - virtual std::string GetName() const { return op_name; } \ - virtual std::string GetEquation() const { return equation; } \ - }; \ - REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ - __ElemwiseOp##op_type##Maker__, \ - ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker, \ - ::paddle::operators::ElementwiseOpInplace); \ - REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad, \ - ::paddle::operators::ElementwiseGradOpInplace) +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation) \ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker, \ + ::paddle::operators::ElementwiseOpInplace); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad, \ + ::paddle::operators::ElementwiseGradOpInplace, \ + ::paddle::operators::ElementwiseGradNoBufVarsInference) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index efc66374c812cbd07adef6ac25c9616b880ec383..04c87c1b2ac398f8f75265c80bef5326aea15dce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", - "X"); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y"); REGISTER_OP_CPU_KERNEL( elementwise_sub, diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 55cef93aacd43174edefbb8aa740bcbea3d8feef..91f3818f2165c91eef88921859afe5703bd65685 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gather_op.h" +#include <memory> +#include <string> +#include <vector> #include "paddle/fluid/framework/ddim.h" namespace paddle { @@ -59,8 +62,9 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -94,13 +98,34 @@ Out = [[3, 4], )DOC"); } }; + +class GatherGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("gather_grad"); + op->SetInput("Index", Input("Index")); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherGradNoNeedBufferVarInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); + ops::GatherGradOpDescMaker); +REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, + ops::GatherGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>, ops::GatherOpKernel<double>, ops::GatherOpKernel<int>, ops::GatherOpKernel<uint8_t>, diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index a814c365d70ae91490e7fb50a0baf8fec05d97ef..e0ab02cd90cdee848250a6aba882b0cb0c17abd7 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lod_reset_op.h" +#include <memory> namespace paddle { namespace operators { @@ -146,18 +147,39 @@ class LoDResetGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; +class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("lod_reset_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp); + ops::LoDResetGradDescMaker); +REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, + ops::LoDResetGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL( lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>, ops::LoDResetKernel<paddle::platform::CPUPlace, double>, diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 740cd5219c70331d1f71d832adef084c148a2408..0860fb845976c02562a181139e27bd1912a7c179 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -21,6 +21,7 @@ #include <cstdlib> #include <fstream> #include <iostream> +#include <memory> #include <sstream> #include <string> #include <unordered_map> @@ -152,7 +153,7 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( &ReadThread, file_groups_[thread_id], data_desc_, static_cast<int>(thread_id), &read_thread_status_, queue_))); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d15eadcf3073af9b65558b57af7de0389649cfb9..7bf089637862c969d27f957a630469d1644222bf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -139,6 +140,7 @@ PYBIND11_MODULE(core, m) { paddle::platform::CpuTotalPhysicalMemory(); paddle::memory::allocation::UseAllocatorStrategyGFlag(); + m.doc() = "C++ core of PaddlePaddle"; // using framework in this function. Since it is inside a function, it will @@ -153,6 +155,11 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + // NOTE(zjl): ctest would load environment variables at the beginning even + // though we have not `import paddle.fluid as fluid`. So we add this API + // to enable eager deletion mode in unittest. + m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode); + m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); @@ -281,6 +288,8 @@ PYBIND11_MODULE(core, m) { py::class_<Tensor>(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("_is_initialized", + [](const Tensor &self) { return self.IsInitialized(); }) .def("_get_dims", [](const Tensor &self) { return vectorize(self.dims()); }) .def("_set_dims", @@ -681,7 +690,8 @@ All parameter, weight, gradient are variables in Paddle. .def("drop_kids", &Scope::DropKids, R"DOC( Delete all sub-scopes of the current scope. - )DOC"); + )DOC") + .def("_kids", &Scope::kids); m.def("Scope", []() -> Scope * { diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py new file mode 100644 index 0000000000000000000000000000000000000000..adf07897d561cf49c70841c5a4114b51b4cf55f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -0,0 +1,183 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +os.environ['FLAGS_use_ngraph'] = '0' +os.environ['FLAGS_use_mkldnn'] = '0' +os.environ['CPU_NUM'] = '4' + +import paddle.fluid as fluid +import six +import unittest +import multiprocessing + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + + +def simple_fc_net(): + image = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = image + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + optimizer = fluid.optimizer.Adam(learning_rate=1e-3) + optimizer.minimize(loss) + return image, label, loss + + +def get_persistables_and_non_persistables(prog, fetch_list): + num_block = prog.num_blocks + persitables = set() + non_persistables = set() + for bid in six.moves.range(num_block): + block = prog.block(bid) + for _, var in block.vars.items(): + if var.persistable or var.name in fetch_list: + persitables.add(var.name) + else: + non_persistables.add(var.name) + + return persitables, non_persistables + + +class TestExecutor(unittest.TestCase): + def test_executor_main(self): + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for p in places: + self.place = p + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + with fluid.unique_name.guard(): + self.executor_main() + + for p in places: + self.place = p + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + with fluid.unique_name.guard(): + self.pe_main() + + def prepare_feed(self, image, label, dev_cnt=1): + batch_size = 32 * dev_cnt + image_shape = (batch_size, ) + tuple(image.shape[1:]) + label_shape = (batch_size, ) + tuple(label.shape[1:]) + + image_np = np.random.random(size=image_shape).astype('float32') + label_np = np.random.random_integers( + low=0, high=9, size=label_shape).astype('int64') + + return image_np, label_np + + def assertScopeVar(self, scope, persitables, non_persistables): + outline_p_vars = [] + for name in persitables: + var = scope.find_var(name) + self.assertTrue(var is not None) + t = var.get_tensor() + if not t._is_initialized(): + outline_p_vars.append(name) + + outline_np_vars = [] + for name in non_persistables: + var = scope.find_var(name) + self.assertTrue(var is not None) + t = var.get_tensor() + if t._is_initialized(): + outline_np_vars.append(name) + + print('Non-alive persistable vars {} in {}'.format(outline_p_vars, + persitables)) + print('Alive non-persistable vars {} in {}'.format(outline_np_vars, + non_persistables)) + self.assertEqual(len(outline_p_vars), 0) + self.assertEqual(len(outline_np_vars), 0) + + def executor_main(self): + image, label, loss = simple_fc_net() + loss.persistable = False + persistables, non_persistables = get_persistables_and_non_persistables( + fluid.default_main_program(), [loss.name]) + print('Non-persistable var number {}'.format(len(non_persistables))) + print(non_persistables) + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + + p = fluid.core.Place() + p.set_place(self.place) + exe = fluid.core.Executor(p) + + for _ in six.moves.range(10): + image_np, label_np = self.prepare_feed(image, label) + fluid.global_scope().var(image.name).get_tensor().set(image_np, + self.place) + fluid.global_scope().var(label.name).get_tensor().set(label_np, + self.place) + # exe.run would not create local scope + # so that we can detect whether gc clears temporary variables + exe.run(fluid.default_main_program().desc, + fluid.global_scope(), 0, False, True, [loss.name]) + self.assertScopeVar(fluid.global_scope(), persistables, + non_persistables) + + def pe_main(self): + image, label, loss = simple_fc_net() + loss.persistable = False + persitables, non_persistables = get_persistables_and_non_persistables( + fluid.default_main_program(), [loss.name]) + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.num_iteration_per_drop_scope = 100 + + build_strategy = fluid.BuildStrategy() + build_strategy.memory_optimize = False + build_strategy.enable_inplace = False + + prog = fluid.CompiledProgram(fluid.default_main_program( + )).with_data_parallel( + loss_name=loss.name, exec_strategy=exec_strategy) + + dev_cnt = fluid.core.get_cuda_device_count() if isinstance(self.place, fluid.CUDAPlace) \ + else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + for idx in six.moves.range(10): + image_np, label_np = self.prepare_feed(image, label, dev_cnt) + feed = {image.name: image_np, label.name: label_np} + + exe.run(program=prog, feed=feed, fetch_list=[loss]) + + local_scopes = prog._local_scopes + for scope in local_scopes: + kids = scope._kids() + self.assertTrue(len(kids) == 1) + self.assertScopeVar(kids[0], persistables, non_persistables) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index 910f53a91a7b5ca1413adf9505ed2c3ad3d56dad..d4c043d9c76f21482f17b9bb20c4fde5ce7cc6e7 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -13,7 +13,6 @@ # limitations under the License. import os -os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' os.environ['CPU_NUM'] = '2' import six diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index 5ed3d9fdf3bf765f1b9ef8ba1ef2a5795f1874c7..1023c18f410fb60592154bbdf421d58aa88c71ae 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -16,6 +16,8 @@ import unittest from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + def gru_net(data, label, diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 8462c06aa56e0469fd06c7dc4b2ed514f7eb51ba..6784edb9d7b2e9cd95f8646e9f8a210296dac94e 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -16,6 +16,8 @@ from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid import unittest +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + def lstm_net(data, label, diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 56dfb095def62bc617948821038f0c15c1547683..ecdf9efa451743f8368079183fcb33f1769a6ab5 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -14,7 +14,9 @@ import os import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +import paddle.fluid as fluid + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) # FIXME(zjl): It seems that this unittest fails randomly # when comparing all reduce last loss and reduce last loss diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 05cc41b96f1992718c21eb5d7d2605dd8d3b2218..44568ff66b61affdd5be809e23ba09597645d470 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -14,7 +14,9 @@ import os import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +import paddle.fluid as fluid + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio' diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 898d04ebe1c9c2c3a336aeca07ab6ce79a890e0a..581f7eff896791da33e179bb8a10f7742aa2d05e 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -16,8 +16,6 @@ from __future__ import print_function import os os.environ['CPU_NUM'] = '2' -os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' -os.environ['FLAGS_fast_eager_deletion_mode'] = '1' import unittest import paddle.fluid as fluid @@ -29,6 +27,8 @@ import paddle.fluid.compiler as compiler import numpy import multiprocessing +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + class TestEagerDeletionWhileOpBase(unittest.TestCase): def test_main(self): diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index 7607189454b2264523176b6853fd9debddf47eed..ef06e7d9fcf7597c721b19a1e13647471c83e7a6 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -14,11 +14,12 @@ import os import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55" +import paddle.fluid as fluid os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio' +fluid.core._set_eager_deletion_mode(0.0, 0.55, True) + from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index 1a252ea547e4d93d83f64fa9cdb3605eeef0a3cf..aad2eaed94a356d06afb7cd461eecefa2de98d8c 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -168,3 +168,7 @@ class TestROIAlignOp(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main()