Unverified commit c7c6eeb4, authored by Zeng Jinle, committed by GitHub

Merge pull request #16409 from sneaxiy/feature/advance_gc

Enhance gc to support deleting tensor buffer in advance
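For context, this change centralizes the eager-deletion switches (FLAGS_eager_delete_tensor_gb, FLAGS_fast_eager_deletion_mode, FLAGS_memory_fraction_of_eager_deletion) behind accessors declared in garbage_collector.h. Below is a minimal usage sketch, assuming a Paddle build where that header resolves; it is not part of the diff itself:

// Sketch only: exercises the accessors introduced/moved by this PR.
#include "paddle/fluid/framework/garbage_collector.h"

void ConfigureEagerDeletion() {
  namespace fw = paddle::framework;
  // Collect once ~0.5 GB of garbage accumulates, consider only the largest 80%
  // of variables, and release memory without waiting for GPU kernels to end.
  fw::SetEagerDeletionMode(/*threshold=*/0.5, /*fraction=*/0.8,
                           /*fast_mode=*/true);

  // Executor and pass code read the same flags back through these getters.
  auto threshold_bytes = fw::GetEagerDeletionThreshold();  // -1 if disabled
  bool fast = fw::IsFastEagerDeletionModeEnabled();
  double fraction = fw::GetEagerDeletionMemoryFraction();
  (void)threshold_bytes;
  (void)fast;
  (void)fraction;
}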
......@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader)
......@@ -164,6 +164,8 @@ else()
set(NGRAPH_EXE_DEPS)
endif()
cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
......@@ -174,7 +176,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector while_op_helper)
target_link_libraries(executor while_op_helper executor_gc_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
......@@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_con
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
......@@ -22,14 +22,9 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle {
namespace framework {
namespace details {
......@@ -206,8 +201,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
}
}
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
FLAGS_memory_fraction_of_eager_deletion);
double memory_fraction = framework::GetEagerDeletionMemoryFraction();
op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
for (auto &pair : op_vars_map) {
auto *op = pair.first;
......@@ -239,8 +235,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
auto while_op_eager_deletion_pass =
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
class EarlyDeleteOpHandle : public OpHandleBase {
public:
EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
const platform::Place& place,
const std::vector<std::string>& names,
GarbageCollector* gc)
: OpHandleBase(node),
scope_(scope),
place_(place),
names_(names),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(place);
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
#endif
}
~EarlyDeleteOpHandle() {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string Name() const override { return "early_delete"; }
protected:
void RunImpl() override {
std::vector<std::shared_ptr<memory::Allocation>> tensors;
auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
for (auto& var_name : names_) {
auto* var = local_scope->FindVar(var_name);
PADDLE_ENFORCE(var != nullptr,
string::Sprintf("Local Scope not has var %s", var_name));
if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
tensors.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
for (auto& tensor : *tensor_array) {
tensors.emplace_back(tensor.MoveMemoryHolder());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
if (platform::is_cpu_place(place_)) {
ClearCPUTensors(tensors);
} else {
ClearGPUTensors(tensors);
}
}
void ClearCPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
if (gc != nullptr) {
gc->Add(tensors);
}
}
void ClearGPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
#ifdef PADDLE_WITH_CUDA
auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
#endif
}
const Scope* scope_;
const platform::Place place_;
std::vector<std::string> names_;
GarbageCollector* gc_;
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext* dev_ctx_;
cudaEvent_t event_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
......@@ -36,27 +37,86 @@ enum OpInfoFillType {
kGradOpDescMaker = 2,
kVarTypeInference = 3,
kShapeInference = 4,
kInplaceOpInference = 5
kInplaceOpInference = 5,
kNoNeedBufferVarsInference = 6,
kUnknown = -1
};
namespace internal {
template <typename T, OpInfoFillType kType>
struct TypePair {
using Type = T;
static constexpr OpInfoFillType kFillType = kType;
};
using OpRegistryClasses = std::tuple< // NOLINT
TypePair<OperatorBase, kOperator>, // NOLINT
TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>, // NOLINT
TypePair<GradOpDescMakerBase, kGradOpDescMaker>, // NOLINT
TypePair<VarTypeInference, kVarTypeInference>, // NOLINT
TypePair<InferShapeBase, kShapeInference>, // NOLINT
TypePair<InplaceOpInference, kInplaceOpInference>, // NOLINT
TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference> // NOLINT
>;
static constexpr int kOpRegistryClassNumber =
std::tuple_size<OpRegistryClasses>::value;
template <typename T, int kPos, bool kIsBounded /* = true*/>
struct IsMatchedBaseTypeImpl {
using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
static constexpr bool kValue =
std::is_base_of<typename PairType::Type, T>::value;
};
template <typename T, int kPos>
struct IsMatchedBaseTypeImpl<T, kPos, false> {
static constexpr bool kValue = false;
};
template <typename T, int kPos>
static inline constexpr bool IsMatchedBaseType() {
return IsMatchedBaseTypeImpl<
T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
}
template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
struct OpInfoFillTypeGetterImpl {};
// This case should not happen
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
static constexpr OpInfoFillType kType = kUnknown;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
static constexpr OpInfoFillType kType =
OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
IsMatchedBaseType<T, kStart + 1>()>::kType;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
static constexpr OpInfoFillType kType = PairType::kFillType;
};
template <typename T>
using OpInfoFillTypeGetter =
OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
kOpRegistryClassNumber == 0,
IsMatchedBaseType<T, 0>()>;
} // namespace internal
template <typename T>
struct OpInfoFillTypeID {
static constexpr OpInfoFillType ID() {
return std::is_base_of<OperatorBase, T>::value
? kOperator
: (std::is_base_of<OpProtoAndCheckerMaker, T>::value
? kOpProtoAndCheckerMaker
: (std::is_base_of<GradOpDescMakerBase, T>::value
? kGradOpDescMaker
: (std::is_base_of<VarTypeInference, T>::value
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: (std::is_base_of<
InplaceOpInference, T>::value
? kInplaceOpInference
: static_cast<OpInfoFillType>(
-1))))));
return internal::OpInfoFillTypeGetter<T>::kType;
}
};
......@@ -156,6 +216,18 @@ struct OpInfoFiller<T, kInplaceOpInference> {
}
};
template <typename T>
struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs) {
T infer(inputs, outputs, attrs);
return infer();
};
}
};
} // namespace details
} // namespace framework
......
......@@ -193,6 +193,79 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
return shrink_func(computation_op);
}
/**
* Shrink op dependencies according to no need buffer vars.
*
* If an op does not need the Tensor buffer of some input,
* just remove the dependency of this op, i.e., decrease the reference count.
*
* For example, input Y of the elementwise_add_grad op is only used to infer
* the shape and lod of Y@GRAD, so we do not need the buffer of input Y. The
* data buffer of input Y can be collected before elementwise_add_grad runs.
*
* This method returns whether the dependency count decreases to 0, and
* shrinks op dependency if possible.
*/
static bool ShrinkNoNeedBufferVarOpDependency(
const std::string &var_name,
std::unordered_set<ComputationOpHandle *> *op_handles) {
std::vector<ComputationOpHandle *> skip_ops;
for (auto *op_handle : *op_handles) {
auto *op_base = op_handle->GetOp();
auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
if (!inferer) {
continue;
}
std::unordered_set<std::string> no_need_buffer_vars =
inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
// Check whether var_name occurs in other inputs or outputs of the op
// If it occurs, we cannot decrease the dependency number.
bool occurred_in_other_vars = false;
for (auto &in_pair : op_base->Inputs()) {
if (no_need_buffer_vars.count(in_pair.first) > 0) {
continue;
}
auto &args = in_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (occurred_in_other_vars) {
continue;
}
for (auto &out_pair : op_base->Outputs()) {
auto &args = out_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (!occurred_in_other_vars) {
VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
skip_ops.emplace_back(op_handle);
}
}
if (skip_ops.size() == op_handles->size()) {
op_handles->clear();
return true;
} else {
for (auto *skip_op : skip_ops) {
op_handles->erase(skip_op);
}
return false;
}
}
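// Illustrative sketch (not part of the pass): for elementwise_add_grad, whose
// NoNeedBufferVarsInferer returns {"Y"}, a variable that appears only in input
// slot "Y" lets the op be dropped from that variable's last-live-op set.
// elementwise_add_grad_handle and "y0" below are hypothetical names:
//
//   std::unordered_set<ComputationOpHandle *> ops = {elementwise_add_grad_handle};
//   bool all_shrunk = ShrinkNoNeedBufferVarOpDependency("y0", &ops);
//   // all_shrunk == true and ops is now empty, so the pass falls back to the
//   // previous version of "y0" when computing reference counts.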
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
......@@ -229,17 +302,43 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
continue;
}
bool ok;
auto result = ExtractComputationOpFromLastLivedVar(
name_var_pair.second.back(), i, shrink_func, &ok);
auto &var_name = name_var_pair.first;
auto &var_handles = name_var_pair.second;
for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
++iter) {
bool ok;
auto result =
ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok);
// Rarely, some vars may have no pending or preceding computation ops;
// just break.
if (!ok) break;
VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
size_t original_op_deps = result.size();
// If all ops do not need buffer of var_name, calculate reference count
// of the previous version of var_name.
if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) {
VLOG(10) << "Try to precede reference count computing at var "
<< var_name;
continue;
}
size_t final_op_deps = result.size();
if (final_op_deps < original_op_deps) {
VLOG(5) << "Shrink op deps from " << original_op_deps << " to "
<< final_op_deps;
}
if (ok) {
auto &var_name = name_var_pair.first;
PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
var_name);
ref_cnts[i].emplace(var_name, result.size());
last_live_ops_of_vars[i].emplace(var_name, std::move(result));
}
// Rarely, all preceding attempts fail.
// Just skip this corner case.
}
}
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
......@@ -48,97 +49,23 @@ namespace {
int kProgramId = -1;
} // namespace
static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
std::unordered_map<std::string, size_t> ref_cnts;
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
if (skip_vars.count(name)) continue;
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
type != proto::VarType::SELECTED_ROWS &&
type != proto::VarType::LOD_TENSOR_ARRAY) {
continue;
}
++ref_cnts[name];
}
}
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
}
return ref_cnts;
}
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& keep_vars, bool force_disable_gc)
: prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
global_ref_cnts_ =
GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {}
void ExecutorPrepareContext::PrepareUnusedVars(
const std::vector<std::string>& keep_vars, bool force_disable_gc) {
force_disable_gc_ = force_disable_gc;
if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
return;
}
unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
}
ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}
static void DeleteUnusedTensors(
const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
std::unordered_map<std::string, size_t>* ref_cnts) {
std::deque<std::shared_ptr<memory::Allocation>> garbages;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if (--(it->second) != 0) {
continue;
}
auto* var = scope.FindVar(name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(
var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), name);
}
}
}
};
handler(op->Inputs());
handler(op->Outputs());
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
......@@ -362,8 +289,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
program, block_id, skip_ref_cnt_vars, force_disable_gc));
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
......@@ -375,6 +302,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
}
#endif
ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
return ctx;
}
......@@ -389,19 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
size_t idx = 0;
for (auto& bid : block_ids) {
ExecutorPrepareContext* ctx;
if (skip_ref_cnt_vars.empty()) {
ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
force_disable_gc);
} else {
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
force_disable_gc);
}
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto* ctx = new ExecutorPrepareContext(program, bid);
auto& block = program.Block(bid);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
if (skip_ref_cnt_vars.empty()) {
ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc);
} else {
ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc);
}
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
++idx;
}
......@@ -425,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
// FIXME(zjl): recurrent_op is rather complex, we would
// disable gc forcely in recurrent_op
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) {
......@@ -453,8 +378,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op->Run(*local_scope, place_);
if (gc) {
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->runtime_ref_cnts_));
DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get());
}
}
......
......@@ -30,22 +30,20 @@ namespace paddle {
namespace framework {
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false);
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
~ExecutorPrepareContext();
void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
void PrepareUnusedVars(const std::vector<std::string>& keep_vars,
bool force_disable_gc = false);
const framework::ProgramDesc& prog_;
size_t block_id_;
bool force_disable_gc_;
const size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
std::unordered_map<std::string, size_t> global_ref_cnts_;
std::unordered_map<std::string, size_t> runtime_ref_cnts_;
std::unordered_map<OperatorBase*, std::vector<std::string>> unused_vars_;
bool force_disable_gc_{false};
};
class Executor {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor_gc_helper.h"
#include <deque>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
struct OpInOutInfo {
public:
void Build(const OperatorBase *op) {
is_built_ = true;
auto &inferer = op->Info().NoNeedBufferVarsInferer();
if (inferer) {
no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
if (no_need_buffer_ins_.empty()) return;
for (auto &in_name_pair : op->Inputs()) {
if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
continue;
}
for (auto &in_arg_name : in_name_pair.second) {
other_args_set_.insert(in_arg_name);
}
}
for (auto &out_name_pair : op->Outputs()) {
for (auto &out_arg_name : out_name_pair.second) {
other_args_set_.insert(out_arg_name);
}
}
}
}
bool IsBuilt() const { return is_built_; }
bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
return no_need_buffer_ins_.empty() ||
other_args_set_.count(in_arg_name) != 0;
}
private:
// A set to record unused buffer input vars of op
std::unordered_set<std::string> no_need_buffer_ins_;
// A set to record other args of op (including in, out)
std::unordered_set<std::string> other_args_set_;
bool is_built_{false};
};
static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
const std::unordered_set<std::string> &skip_vars) {
if (skip_vars.count(name) != 0) {
return false;
}
auto *var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) {
return false;
}
auto type = var_desc->Proto()->type().type();
return type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY;
}
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_var_list) {
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
std::unordered_map<std::string, size_t> var_op_idx_map;
for (size_t i = 0; i < ops.size(); ++i) {
auto *op = ops[i].get();
OpInOutInfo info;
for (auto &name_pair : op->Inputs()) {
for (auto &name : name_pair.second) {
if (!VarCanBeDeleted(name, block, skip_vars)) {
continue;
}
// var can be gc-ed
if (!info.IsBuilt()) {
info.Build(op);
}
if (info.IsInArgBufferNeeded(name)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
} else {
VLOG(10) << "Skip reference count computing of variable "
<< name_pair.first << "(" << name << ") in Operator "
<< op->Type();
}
}
}
for (auto &name_pair : op->Outputs()) {
for (auto &name : name_pair.second) {
if (VarCanBeDeleted(name, block, skip_vars)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
}
}
}
}
std::unordered_map<OperatorBase *, std::vector<std::string>> result;
for (auto &name_op_idx_pair : var_op_idx_map) {
auto &name = name_op_idx_pair.first;
size_t op_idx = name_op_idx_pair.second;
result[ops[op_idx].get()].emplace_back(name);
}
return result;
}
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc) {
auto iter = delete_vars_map.find(op);
if (iter == delete_vars_map.end()) {
return;
}
auto &delete_vars = iter->second;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &var_name : delete_vars) {
auto *var = scope.FindVar(var_name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << var_name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_name);
}
}
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
// Result map: op -> variable names that can be deleted after op runs
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_vars);
// Collect unused tensors after op runs
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc);
} // namespace framework
} // namespace paddle
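The two helpers above combine in the executor loop: GetUnusedVars is computed once per prepared program, and DeleteUnusedTensors runs after each op. A condensed sketch of that flow, paraphrasing the executor.cc changes in this diff (block, ops, keep_vars, scope, place, and gc stand in for the members set up in Executor::Prepare and RunPreparedContext):

// Sketch only: condensed from the Executor changes in this PR.
auto unused_vars = GetUnusedVars(block, ops, keep_vars);
for (auto& op : ops) {
  op->Run(scope, place);
  if (gc) {
    // Frees the buffers of every variable whose last user is `op`.
    DeleteUnusedTensors(scope, op.get(), unused_vars, gc.get());
  }
}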
......@@ -13,14 +13,36 @@
// limitations under the License.
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
DEFINE_double(
eager_delete_tensor_gb, -1.0,
"Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0");
DEFINE_bool(fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends.");
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
GarbageCollector::GarbageCollector(const platform::Place &place,
size_t max_memory_size)
: max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
......@@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback(
callback_manager_->AddCallback(callback);
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
: static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
(static_cast<int64_t>(1) << 30));
}
bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) {
FLAGS_eager_delete_tensor_gb = threshold;
FLAGS_memory_fraction_of_eager_deletion = fraction;
FLAGS_fast_eager_deletion_mode = fast_mode;
}
double GetEagerDeletionMemoryFraction() {
return FLAGS_memory_fraction_of_eager_deletion;
}
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,8 @@
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) {
}
}
int64_t GetEagerDeletionThreshold();
bool IsFastEagerDeletionModeEnabled();
void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode);
double GetEagerDeletionMemoryFraction();
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace framework {
class NoNeedBufferVarsInference {
public:
NoNeedBufferVarsInference(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs)
: inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
virtual ~NoNeedBufferVarsInference() = default;
const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; }
const AttributeMap &Attrs() const { return attrs_; }
virtual std::unordered_set<std::string> operator()() const = 0;
private:
const VariableNameMap &inputs_;
const VariableNameMap &outputs_;
const AttributeMap &attrs_;
};
#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \
class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \
public: \
using ::paddle::framework::NoNeedBufferVarsInference:: \
NoNeedBufferVarsInference; \
\
std::unordered_set<std::string> operator()() const override { \
return {__VA_ARGS__}; \
} \
}
} // namespace framework
} // namespace paddle
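Operators opt in by declaring an inferer with the macro above and listing it in REGISTER_OPERATOR, as the concat_grad, gather_grad, and lod_reset_grad changes later in this diff do. A minimal sketch with a hypothetical my_op_grad:

// Sketch only: MyOpGrad / my_op_grad are hypothetical; the real usages are the
// concat, gather, and lod_reset changes further down in this PR.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MyOpGradNoNeedBufferVarsInference, "X");

// Registration attaches the inferer to the grad op; calling it with
// (inputs, outputs, attrs) returns {"X"}, i.e. the buffer of input slot "X"
// is not needed while my_op_grad runs and may be freed earlier.
REGISTER_OPERATOR(my_op_grad, ops::MyOpGrad,
                  ops::MyOpGradNoNeedBufferVarsInference);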
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/macros.h"
......@@ -39,6 +40,7 @@ struct OpInfo {
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
InferInplaceOpFN infer_inplace_;
InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
......@@ -64,6 +66,10 @@ struct OpInfo {
}
const OpAttrChecker* Checker() const { return checker_; }
const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
return infer_no_need_buffer_vars_;
}
};
class OpInfoMap {
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <algorithm>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
......@@ -326,7 +327,12 @@ OperatorBase::OperatorBase(const std::string& type,
const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs)
: type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
: type_(type),
inputs_(inputs),
outputs_(outputs),
attrs_(attrs),
// NOTE(zjl): why op_info may be nullptr?
info_(OpInfoMap::Instance().GetNullable(type)) {
GenerateTemporaryNames();
CheckAllInputOutputSet();
}
......@@ -350,7 +356,7 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
}
return ret_val;
}
auto& info = OpInfoMap::Instance().Get(Type());
auto& info = Info();
// get all OpProto::Var for outputs
for (auto& o : info.Proto().outputs()) {
......@@ -366,18 +372,16 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
}
void OperatorBase::CheckAllInputOutputSet() const {
auto& info_map = OpInfoMap::Instance();
auto* op_info = info_map.GetNullable(Type());
if (op_info == nullptr || op_info->proto_ == nullptr) return;
if (info_ == nullptr || info_->proto_ == nullptr) return;
for (auto& in : op_info->Proto().inputs()) {
for (auto& in : info_->Proto().inputs()) {
if (!in.dispensable()) {
PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
"Operator %s's input, %s, is not set", Type(), in.name());
}
}
for (auto& out : op_info->Proto().outputs()) {
for (auto& out : info_->Proto().outputs()) {
if (!out.dispensable()) {
PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
"Operator %s's output, %s, is not set", Type(),
......@@ -997,7 +1001,27 @@ Scope* OperatorWithKernel::PrepareData(
std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const {
Scope* new_scope = nullptr;
std::unordered_set<std::string> no_buffer_ins;
if (info_) {
auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
// Some op may not register NoNeedBufferVarsInferer
if (no_buffer_inferer) {
no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs());
}
}
for (auto& var_name_item : Inputs()) {
// NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set
// is empty. At least STL implemented on my mac does calculate hash code
// of search key even though the set is empty.
if (!no_buffer_ins.empty() &&
no_buffer_ins.count(var_name_item.first) > 0) {
VLOG(1) << "Skip scanning input " << var_name_item.first
<< " in Operator " << type_;
continue;
}
std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
......
......@@ -160,6 +160,11 @@ class OperatorBase {
const VariableNameMap& Inputs() const { return inputs_; }
const VariableNameMap& Outputs() const { return outputs_; }
const OpInfo& Info() const {
PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_);
return *info_;
}
bool HasInputs(const std::string& name) const;
//! Get a input with argument's name described in `op_proto`
std::string Input(const std::string& name) const;
......@@ -194,6 +199,10 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap outputs_;
AttributeMap attrs_;
// OpInfo
const OpInfo* info_;
// Whether this operator executes in an Executor.
bool run_by_executor_{true};
......@@ -444,7 +453,7 @@ class OperatorWithKernel : public OperatorBase {
}
virtual void InferShape(InferShapeContext* ctx) const {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
Info().infer_shape_(ctx);
}
void RuntimeInferShape(const Scope& scope, const platform::Place& place,
......
......@@ -29,15 +29,6 @@ DEFINE_bool(
"Delete local scope eagerly. It will reduce GPU memory usage but "
"slow down the destruction of variables.(around 1% performance harm)");
DEFINE_double(
eager_delete_tensor_gb, -1.0,
"Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0");
DEFINE_bool(fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends.");
// When in inference scenario, the scopes will not be written by two threads in
// a mean time, but a scope may be read by multiple threads concurrently, and
// the mutex will cause serious performance issue.
......@@ -57,15 +48,6 @@ DEFINE_bool(fast_eager_deletion_mode, true,
namespace paddle {
namespace framework {
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
: static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
(static_cast<int64_t>(1) << 30));
}
bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
......
......@@ -32,9 +32,6 @@ extern "C" {
namespace paddle {
namespace framework {
int64_t GetEagerDeletionThreshold();
bool IsFastEagerDeletionModeEnabled();
class Scope;
/**
......
......@@ -30,6 +30,7 @@ class InferShapeContext;
class InferVarTypeContext;
class BlockDesc;
class Variable;
class NoNeedBufferVarsInference;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// TODO(panyx0718): Replace vector with something like gtl::Vector.
......@@ -61,5 +62,9 @@ using InferShapeFN = std::function<void(InferShapeContext*)>;
using InplacePair = std::unordered_map<std::string, std::string>;
using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
const AttributeMap& /*attrs*/)>;
} // namespace framework
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/add_position_encoding_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -39,13 +40,8 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Out@GRAD must not be null.");
auto out_dims = ctx->GetInputDim("Out");
if (ctx->HasOutput(framework::GradVarName("X"))) {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
}
}
......@@ -75,6 +71,22 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
class AddPositionEncodingGradOpDescMaker
: public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("add_position_encoding_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
......@@ -83,7 +95,7 @@ namespace plt = paddle::platform;
REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
ops::AddPositionEncodingOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::AddPositionEncodingGradOpDescMaker);
REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
REGISTER_OP_CPU_KERNEL(
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/clip_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -76,12 +77,28 @@ class ClipOpGrad : public framework::OperatorWithKernel {
}
};
class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("clip_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::ClipGradOpDescMaker);
REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad);
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include <memory>
#include <string>
#include <vector>
......@@ -120,11 +121,7 @@ Examples:
class ConcatOpGrad : public framework::OperatorWithKernel {
public:
ConcatOpGrad(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
auto in_x = "X";
......@@ -142,6 +139,33 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.GetPlace());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference,
"X");
class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("concat_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
......@@ -149,9 +173,9 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<
false> /* set false to disable empty grad */);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
ops::ConcatGradOpDescMaker);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
ops::ConcatOpGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -455,13 +455,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
return type;
}
class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
class Conv2DGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType(GradOpType());
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("Input", Input("Input"));
op->SetInput("Filter", Input("Filter"));
op->SetInput("Bias", Input("Bias"));
......@@ -470,14 +470,33 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
class Conv3DGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
virtual std::string GradOpType() const {
return this->ForwardOpType() + "_grad";
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("Input", Input("Input"));
op->SetInput("Filter", Input("Filter"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
if (ForwardOp().Inputs().count("ResidualData") != 0) {
op->SetInput("ResidualData", Input("ResidualData"));
}
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
......@@ -486,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType, ops::Conv2dGradMaker);
ops::ConvOpInferVarType, ops::Conv2DGradMaker);
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType, ops::Conv2dGradMaker);
ops::ConvOpInferVarType, ops::Conv2DGradMaker);
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
ops::ConvOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::ConvOpInferVarType, ops::Conv3DGradMaker);
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
// depthwise conv kernel
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
#include <boost/lexical_cast.hpp>
#include <memory>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......@@ -178,12 +180,31 @@ class CropOpGrad : public framework::OperatorWithKernel {
}
};
class CropGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("crop_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("X", Input("X"));
if (ForwardOp().Inputs().count("Offsets") > 0) {
op->SetInput("Offsets", Input("Offsets"));
}
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::CropGradOpDescMaker);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(
crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -238,6 +238,23 @@ class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
}
};
class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("cross_entropy_grad");
op->SetInput("X", Input("X"));
op->SetInput("Label", Input("Label"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
class CrossEntropyOp2 : public CrossEntropyOpBase {
public:
using CrossEntropyOpBase::CrossEntropyOpBase;
......@@ -354,7 +371,7 @@ using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::CrossEntropyGradOpDescMaker);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
ops::CrossEntropyOpKernel<CPUCtx, double>);
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
......@@ -170,11 +171,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("last_h"),
"Input(last_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("last_c"),
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Cache"),
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitH"),
......@@ -197,6 +193,35 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
}
};
class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("cudnn_lstm_grad");
op->SetInput("Input", Input("Input"));
op->SetInput("InitH", Input("InitH"));
op->SetInput("InitC", Input("InitC"));
op->SetInput("W", Input("W"));
if (ForwardOp().Inputs().count("Cache") > 0) {
op->SetInput("Cache", Input("Cache"));
}
op->SetInput("Out", Output("Out"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c"));
op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH"));
op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC"));
op->SetAttrMap(Attrs());
return op;
}
};
template <typename T>
class NotImpleKernel : public framework::OpKernel<T> {
public:
......@@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::CudnnLSTMGradOpDescMaker);
REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
......
......@@ -14,6 +14,7 @@
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
......@@ -218,7 +219,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
boost::get<platform::CUDAPlace>(id_tensor.place()),
id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
stream);
for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
for (int64_t i = 0; i < cpu_tensor.numel(); ++i) {
ids_vector.push_back(cpu_tensor_data[i]);
}
#endif
......
......@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
"X");
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y");
REGISTER_OP_CPU_KERNEL(
elementwise_add,
......
......@@ -272,12 +272,11 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference {
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y");
} // namespace operators
} // namespace paddle
/*
*/
#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \
class kernel_type##GradMaker \
: public paddle::framework::SingleGradOpDescMaker { \
......@@ -311,18 +310,19 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference {
::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
protected: \
virtual std::string GetName() const { return op_name; } \
virtual std::string GetEquation() const { return equation; } \
}; \
REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \
__ElemwiseOp##op_type##Maker__, \
::paddle::operators::ElementwiseOpInferVarType, \
op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
protected: \
virtual std::string GetName() const { return op_name; } \
virtual std::string GetEquation() const { return equation; } \
}; \
REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \
__ElemwiseOp##op_type##Maker__, \
::paddle::operators::ElementwiseOpInferVarType, \
op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace, \
::paddle::operators::ElementwiseGradNoBufVarsInference)
......@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace ops = paddle::operators;
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
"X");
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y");
REGISTER_OP_CPU_KERNEL(
elementwise_sub,
......
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/gather_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace paddle {
......@@ -59,8 +62,9 @@ class GatherGradOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
return framework::OpKernelType(
ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.device_context());
}
};
......@@ -94,13 +98,34 @@ Out = [[3, 4],
)DOC");
}
};
class GatherGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("gather_grad");
op->SetInput("Index", Input("Index"));
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherGradNoNeedBufferVarInference, "X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
ops::GatherGradOpDescMaker);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
ops::GatherGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<uint8_t>,
......
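The new GatherGradOpDescMaker feeds Index, X and grad(Out) into gather_grad, and GatherGradNoNeedBufferVarInference declares that gather_grad never reads X's buffer (its kernel type above is now taken from grad(Out)). With eager deletion enabled, the runtime may therefore free X right after the forward gather finishes. A minimal sketch of the effect, assuming the fluid 1.x Python layers API (fluid.layers.gather, fluid.layers.fc); illustrative only, not part of this diff:

# Hedged sketch: x is a non-persistable intermediate that only gather reads.
import numpy as np
import paddle.fluid as fluid

fluid.core._set_eager_deletion_mode(0.0, 1.0, True)

image = fluid.layers.data(name='image', shape=[4], dtype='float32')
index = fluid.layers.data(
    name='index', shape=[2], dtype='int32', append_batch_size=False)
x = fluid.layers.fc(image, size=4)       # intermediate, non-persistable
out = fluid.layers.gather(x, index)      # forward reads x's buffer
loss = fluid.layers.reduce_mean(out)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# gather_grad only needs Index, grad(Out) and the shape of x, so x's
# buffer may be freed before the backward pass reaches it.
loss_np, = exe.run(feed={
    'image': np.random.random((4, 4)).astype('float32'),
    'index': np.array([0, 2], dtype='int32')
}, fetch_list=[loss])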
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lod_reset_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -146,18 +147,39 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
ctx.device_context());
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
ctx.device_context());
}
};
class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("lod_reset_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("X", Input("X"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
"X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
ops::LoDResetGradDescMaker);
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
ops::LoDResetGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
......@@ -21,6 +21,7 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
......@@ -152,7 +153,7 @@ class CTRReader : public framework::FileReader {
queue_->ReOpen();
VLOG(3) << "reopen success";
VLOG(3) << "thread_num " << thread_num_;
for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
read_threads_.emplace_back(new std::thread(std::bind(
&ReadThread, file_groups_[thread_id], data_desc_,
static_cast<int>(thread_id), &read_thread_status_, queue_)));
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -139,6 +140,7 @@ PYBIND11_MODULE(core, m) {
paddle::platform::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
m.doc() = "C++ core of PaddlePaddle";
// using framework in this function. Since it is inside a function, it will
......@@ -153,6 +155,11 @@ PYBIND11_MODULE(core, m) {
return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
});
  // NOTE(zjl): ctest loads environment variables at startup, even before we
  // `import paddle.fluid as fluid`. So we add this API to enable eager
  // deletion mode inside unittests.
m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
m.add_object("_cleanup",
py::capsule([]() { ScopePool::Instance().Clear(); }));
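A minimal usage sketch of the new binding (illustrative only, not part of this diff); the argument order is assumed from the unittests below, matching FLAGS_eager_delete_tensor_gb, FLAGS_memory_fraction_of_eager_deletion and FLAGS_fast_eager_deletion_mode:

# Hedged sketch: call this before building any program.
# Assumed signature: _set_eager_deletion_mode(memory_threshold_gb,
#                                             memory_fraction, fast_mode)
import paddle.fluid as fluid

fluid.core._set_eager_deletion_mode(0.0, 1.0, True)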
......@@ -281,6 +288,8 @@ PYBIND11_MODULE(core, m) {
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer(
[](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
.def("_is_initialized",
[](const Tensor &self) { return self.IsInitialized(); })
.def("_get_dims",
[](const Tensor &self) { return vectorize(self.dims()); })
.def("_set_dims",
......@@ -681,7 +690,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("drop_kids", &Scope::DropKids,
R"DOC(
Delete all sub-scopes of the current scope.
)DOC");
)DOC")
.def("_kids", &Scope::kids);
m.def("Scope",
[]() -> Scope * {
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
os.environ['FLAGS_use_ngraph'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0'
os.environ['CPU_NUM'] = '4'
import paddle.fluid as fluid
import six
import unittest
import multiprocessing
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
def simple_fc_net():
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = image
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
optimizer.minimize(loss)
return image, label, loss
def get_persistables_and_non_persistables(prog, fetch_list):
num_block = prog.num_blocks
    persistables = set()
non_persistables = set()
for bid in six.moves.range(num_block):
block = prog.block(bid)
for _, var in block.vars.items():
if var.persistable or var.name in fetch_list:
                persistables.add(var.name)
else:
non_persistables.add(var.name)
    return persistables, non_persistables
class TestExecutor(unittest.TestCase):
def test_executor_main(self):
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.place = p
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
with fluid.unique_name.guard():
self.executor_main()
for p in places:
self.place = p
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
with fluid.unique_name.guard():
self.pe_main()
def prepare_feed(self, image, label, dev_cnt=1):
batch_size = 32 * dev_cnt
image_shape = (batch_size, ) + tuple(image.shape[1:])
label_shape = (batch_size, ) + tuple(label.shape[1:])
image_np = np.random.random(size=image_shape).astype('float32')
label_np = np.random.random_integers(
low=0, high=9, size=label_shape).astype('int64')
return image_np, label_np
    def assertScopeVar(self, scope, persistables, non_persistables):
outline_p_vars = []
        for name in persistables:
var = scope.find_var(name)
self.assertTrue(var is not None)
t = var.get_tensor()
if not t._is_initialized():
outline_p_vars.append(name)
outline_np_vars = []
for name in non_persistables:
var = scope.find_var(name)
self.assertTrue(var is not None)
t = var.get_tensor()
if t._is_initialized():
outline_np_vars.append(name)
        print('Non-alive persistable vars {} in {}'.format(outline_p_vars,
                                                            persistables))
        print('Alive non-persistable vars {} in {}'.format(outline_np_vars,
                                                            non_persistables))
self.assertEqual(len(outline_p_vars), 0)
self.assertEqual(len(outline_np_vars), 0)
def executor_main(self):
image, label, loss = simple_fc_net()
loss.persistable = False
persistables, non_persistables = get_persistables_and_non_persistables(
fluid.default_main_program(), [loss.name])
print('Non-persistable var number {}'.format(len(non_persistables)))
print(non_persistables)
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
p = fluid.core.Place()
p.set_place(self.place)
exe = fluid.core.Executor(p)
for _ in six.moves.range(10):
image_np, label_np = self.prepare_feed(image, label)
fluid.global_scope().var(image.name).get_tensor().set(image_np,
self.place)
fluid.global_scope().var(label.name).get_tensor().set(label_np,
self.place)
            # exe.run does not create a local scope here, so we can detect
            # whether gc clears the temporary variables in the global scope
exe.run(fluid.default_main_program().desc,
fluid.global_scope(), 0, False, True, [loss.name])
self.assertScopeVar(fluid.global_scope(), persistables,
non_persistables)
def pe_main(self):
image, label, loss = simple_fc_net()
loss.persistable = False
        persistables, non_persistables = get_persistables_and_non_persistables(
fluid.default_main_program(), [loss.name])
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 100
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
prog = fluid.CompiledProgram(fluid.default_main_program(
)).with_data_parallel(
loss_name=loss.name, exec_strategy=exec_strategy)
dev_cnt = fluid.core.get_cuda_device_count() if isinstance(self.place, fluid.CUDAPlace) \
else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for idx in six.moves.range(10):
image_np, label_np = self.prepare_feed(image, label, dev_cnt)
feed = {image.name: image_np, label.name: label_np}
exe.run(program=prog, feed=feed, fetch_list=[loss])
local_scopes = prog._local_scopes
for scope in local_scopes:
kids = scope._kids()
self.assertTrue(len(kids) == 1)
self.assertScopeVar(kids[0], persistables, non_persistables)
if __name__ == '__main__':
unittest.main()
......@@ -13,7 +13,6 @@
# limitations under the License.
import os
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['CPU_NUM'] = '2'
import six
......
......@@ -16,6 +16,8 @@ import unittest
from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
def gru_net(data,
label,
......
......@@ -16,6 +16,8 @@ from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid
import unittest
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
def lstm_net(data,
label,
......
......@@ -14,7 +14,9 @@
import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
import paddle.fluid as fluid
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
# FIXME(zjl): It seems that this unittest fails randomly
# when comparing all reduce last loss and reduce last loss
......
......@@ -14,7 +14,9 @@
import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
import paddle.fluid as fluid
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
......
......@@ -16,8 +16,6 @@ from __future__ import print_function
import os
os.environ['CPU_NUM'] = '2'
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_fast_eager_deletion_mode'] = '1'
import unittest
import paddle.fluid as fluid
......@@ -29,6 +27,8 @@ import paddle.fluid.compiler as compiler
import numpy
import multiprocessing
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
class TestEagerDeletionWhileOpBase(unittest.TestCase):
def test_main(self):
......
......@@ -14,11 +14,12 @@
import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55"
import paddle.fluid as fluid
os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'
fluid.core._set_eager_deletion_mode(0.0, 0.55, True)
from test_parallel_executor_transformer import TestTransformer
if __name__ == '__main__':
......
......@@ -168,3 +168,7 @@ class TestROIAlignOp(OpTest):
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()