Unverified commit 75122319, authored by WangZhen, committed by GitHub

[JitLayer] Polish PEFunction to speed up JitLayer and fix memory leak (#44738)

* Polish PEFunction to speed up JitLayer

* Polish PEFunction code

* Fix comments
Parent 212f015f
@@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/core/enforce.h"
 
 #include "paddle/fluid/jit/base_function.h"
 #include "paddle/fluid/jit/function_schema.h"
@@ -36,9 +37,14 @@ class ExecutorFunction : public BaseFunction {
                    const Name2VariableMap &params_dict,
                    const phi::Place &place)
       : info_(info), place_(place), inner_exe_(place_) {
+    info_->RemoveDescFeedFetch();
+    PADDLE_ENFORCE_GT(
+        static_cast<int64_t>(info_->ProgramDesc().Block(0).OpSize()),
+        0,
+        platform::errors::PreconditionNotMet(
+            "There is no operator in ProgramDesc."));
     utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_);
     VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
-    info_->RemoveDescFeedFetch();
   }
 
   ~ExecutorFunction() noexcept {}
@@ -56,9 +62,9 @@ class ExecutorFunction : public BaseFunction {
                    false,
                    true,
                    info_->OutputArgNames());
-    std::vector<DenseTensor> res;
-    utils::FetchOuts(info_->OutputArgNames(), scope_, &res);
-    return res;
+    std::vector<DenseTensor> outputs;
+    utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs);
+    return outputs;
   }
 
   const std::shared_ptr<FunctionInfo> &Info() const { return info_; }
...
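The ExecutorFunction constructor now strips the feed/fetch ops before sharing parameters and refuses to build a function whose main block contains no operators; PEFunction below applies the same pattern. A minimal sketch of what that fail-fast behaviour means for a caller, assuming the header location and the surrounding helper (only the constructor signature is taken from this diff):

```cpp
#include <exception>
#include <iostream>
#include <memory>

#include "paddle/fluid/jit/executor_function.h"  // assumed header location

// Hypothetical helper: how `info` and `params_dict` are produced is outside
// this diff; only the ExecutorFunction constructor signature is taken from it.
void BuildFunctionOrReport(
    const std::shared_ptr<paddle::jit::FunctionInfo> &info,
    const paddle::jit::Name2VariableMap &params_dict) {
  try {
    // Feed/fetch ops are removed first; if block 0 of the ProgramDesc is then
    // empty, PADDLE_ENFORCE_GT raises a PreconditionNotMet error.
    paddle::jit::ExecutorFunction func(info, params_dict, phi::CPUPlace());
  } catch (const std::exception &err) {
    std::cerr << "Invalid ProgramDesc: " << err.what() << std::endl;
  }
}
```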
@@ -19,10 +19,14 @@
 #include <vector>
 
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/executor_cache.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/core/enforce.h"
 
 #include "paddle/fluid/jit/base_function.h"
 #include "paddle/fluid/jit/function_schema.h"
@@ -31,72 +35,99 @@
 namespace paddle {
 namespace jit {
 
+using ExecutionStrategy = framework::details::ExecutionStrategy;
+using ParallelExecutor = framework::ParallelExecutor;
+using Graph = framework::ir::Graph;
+
 class PEFunction : public BaseFunction {
  public:
   PEFunction(const std::shared_ptr<FunctionInfo> &info,
              const Name2VariableMap &params_dict,
              const phi::Place &place)
       : info_(info), place_(place) {
+    info_->RemoveDescFeedFetch();
+    PADDLE_ENFORCE_GT(
+        static_cast<int64_t>(info_->ProgramDesc().Block(0).OpSize()),
+        0,
+        platform::errors::PreconditionNotMet(
+            "There is no operator in ProgramDesc."));
     utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_);
     VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
-    info_->RemoveDescFeedFetch();
+    CreateGraphAndPE();
   }
 
   ~PEFunction() noexcept {}
 
-  std::vector<Tensor> operator()(const std::vector<Tensor> &inputs) {
-    auto dense_tensors = utils::ToDenseTensors(inputs);
-    return utils::ToTensors(this->operator()(dense_tensors));
-  }
-
-  std::vector<DenseTensor> operator()(const std::vector<DenseTensor> &inputs) {
-    std::string prog_string;
-    std::hash<std::string> string_hash;
+  static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
+    ExecutionStrategy execution_strategy;
+    auto device_type = platform::Place2DeviceType(place);
+    switch (device_type) {
+      case platform::DeviceType::CPU: {
+        execution_strategy.num_threads_ = 2;
+        break;
+      }
+      case platform::DeviceType::CUDA: {
+        // NOTE: According experiments, one thread is faster in
+        // most model training.
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      case platform::DeviceType::XPU: {
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      case platform::DeviceType::IPU: {
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      default:
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Unsupported Device type %d.", device_type));
+    }
+    execution_strategy.use_device_ = device_type;
+    return execution_strategy;
+  }
+
+  void CreateGraphAndPE() {
+    framework::details::BuildStrategy build_strategy;
+    auto execution_strategy = GetExecutionStrategy(place_);
 
     auto &program_desc = info_->ProgramDesc();
-    // TODO(dev): Serialize is very slow.
-    const_cast<framework::ProgramDesc *>(&program_desc)
-        ->Proto()
-        ->SerializePartialToString(&prog_string);
-    int64_t program_id = static_cast<int64_t>(string_hash(prog_string));
     const framework::BlockDesc &global_block = program_desc.Block(0);
     int64_t start_op_index = 0;
     int64_t end_op_index = static_cast<int64_t>(global_block.OpSize());
+    graph_ =
+        std::make_shared<Graph>(program_desc, start_op_index, end_op_index);
+    inner_pe_ = std::make_shared<ParallelExecutor>(
+        place_, &scope_, execution_strategy, build_strategy, graph_.get());
+    inner_pe_->PrepareVariables(&scope_);
+    inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames());
+  }
+
+  std::vector<Tensor> operator()(const std::vector<Tensor> &inputs) {
+    auto dense_tensors = utils::ToDenseTensors(inputs);
+    return utils::ToTensors(this->operator()(dense_tensors));
+  }
 
+  std::vector<DenseTensor> operator()(const std::vector<DenseTensor> &inputs) {
     utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_);
-    std::vector<std::string> input_var_names = info_->InputArgNames();
-    std::vector<std::string> output_var_names = info_->OutputArgNames();
-
-    if (end_op_index > start_op_index) {
-      auto cache_info = framework::GetExecutorInfoFromCache(program_desc,
-                                                            place_,
-                                                            start_op_index,
-                                                            end_op_index,
-                                                            /*is_grad=*/false,
-                                                            program_id,
-                                                            &scope_);
-      auto &parallel_executor = cache_info.first;
-      auto &skip_eager_delete_vars =
-          framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
-              program_id, false);
-      if (cache_info.second /*is_new_created*/) {
-        parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names);
-        skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
-                                      output_var_names.begin(),
-                                      output_var_names.end());
-        framework::details::ParseSafeEagerDeletionSkipVars(
-            program_desc,
-            end_op_index,
-            output_var_names,
-            &skip_eager_delete_vars);
-      }
-      parallel_executor->RunWithoutFetch(skip_eager_delete_vars);
-    }
-    std::vector<DenseTensor> res;
-    utils::FetchOuts(info_->OutputArgNames(), scope_, &res);
-    return res;
+    // update op_handle scope_map in pe->executor_->Graph
+    std::unordered_map<framework::Scope *, framework::Scope *> scope_map = {
+        {inner_pe_->GetLocalScopes().front(), &scope_}};
+    inner_pe_->ResetOpHandleScopeMapOfGraphs(scope_map);
+    // need to recreate tmp variables in new scope
+    inner_pe_->PrepareVariables(&scope_);
+
+    inner_pe_->RunWithoutFetch(info_->OutputArgNames());
+
+    std::vector<DenseTensor> outputs;
+    utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs);
+    scope_.DropKids();
+    return outputs;
   }
 
   const std::shared_ptr<FunctionInfo> &Info() const { return info_; }
@@ -105,6 +136,8 @@ class PEFunction : public BaseFunction {
   std::shared_ptr<FunctionInfo> info_;
   framework::Scope scope_;
   phi::Place place_;
+  std::shared_ptr<ParallelExecutor> inner_pe_;
+  std::shared_ptr<Graph> graph_;
 };
 
 }  // namespace jit
...
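As the diff above shows, PEFunction now builds its Graph and ParallelExecutor once in the constructor, and each call only shares inputs into the persistent scope, reruns the cached executor, fetches outputs and drops temporary child scopes. A minimal usage sketch inferred from the interfaces above; the include path and the way `info` and `params_dict` are obtained are assumptions, not part of this change:

```cpp
#include <memory>
#include <vector>

#include "paddle/fluid/jit/pe_function.h"  // assumed header location

namespace paddle {
namespace jit {

// Hypothetical driver showing the intended call pattern: construct once,
// call repeatedly.
std::vector<Tensor> RunTwice(const std::shared_ptr<FunctionInfo> &info,
                             const Name2VariableMap &params_dict,
                             const phi::Place &place,
                             const std::vector<Tensor> &inputs) {
  // Graph and ParallelExecutor are created here by CreateGraphAndPE(),
  // instead of being re-created or looked up in a cache on every call.
  PEFunction func(info, params_dict, place);

  // Each call shares the inputs into the persistent scope, reruns the cached
  // executor, fetches the outputs and then drops child scopes.
  func(inputs);          // first call
  return func(inputs);   // later calls reuse the same Graph and executor
}

}  // namespace jit
}  // namespace paddle
```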