Unverified · Commit 63d2d722, authored by Leo Chen, committed by GitHub

[new-exec] Ahead-Of-Time choosing kernel (#48789)

* add skip run

* alloc minimum memory

* skip check_size in Alloc

* skip check_size in Alloc

* skip check_size in Alloc

* fix cases when tensor is initialized or empty

* alloc empty output for place info

* add test

* increase timeout

* format code

* skip cpu

* add cudnn_deterministic

* fit for hostAlloc

* follow comments

* change check_size to fake_alloc
Parent 1804f834
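A minimal usage sketch of what this commit enables, distilled from the test added at the bottom of this diff. The flag FLAGS_new_executor_static_build, paddle.framework.set_flags, and the static-graph Executor appear in the commit; the small fc network below is a hypothetical stand-in used only for brevity.

import numpy as np
import paddle
from paddle.framework import set_flags

paddle.enable_static()

# Ask the new (standalone) executor to build its op list statically: during the
# build pass kernels are chosen ahead of time and outputs only get zero-byte
# "fake" allocations that carry dtype/place information.
set_flags({'FLAGS_new_executor_static_build': 1})

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[4, 8], dtype='float32')
    out = paddle.static.nn.fc(x, size=2)
    loss = paddle.mean(out)

exe = paddle.static.Executor()
exe.run(startup_prog)
loss_val = exe.run(main_prog,
                   feed={'x': np.random.rand(4, 8).astype('float32')},
                   fetch_list=[loss])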
@@ -136,7 +136,8 @@ class VariableCompatTensor
   void* AllocateFrom(phi::Allocator* allocator,
                      phi::DataType dtype,
-                     size_t requested_size = 0) override {
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override {
     PADDLE_THROW(paddle::platform::errors::Unavailable(
         "VariableCompatTensor does not support `AllocateFrom` method."));
   }
......
@@ -33,7 +33,8 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
                               std::string* new_var_name,
                               std::vector<OpFuncNode>* op_func_nodes,
                               bool use_local_scope,
-                              bool is_fetch_v2) {
+                              bool is_fetch_v2,
+                              bool skip_run) {
   bool is_transferred = false;
   auto* src_var_name = &var_name;
@@ -48,7 +49,7 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
                             is_fetch_v2);
     if (op) {
       RunAndConstructOpFuncNode(
-          op, *src_var_name, *new_var_name, op_func_nodes);
+          op, *src_var_name, *new_var_name, op_func_nodes, skip_run);
     }
     // update src_var_name
     src_var_name = new_var_name;
@@ -64,7 +65,7 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
                            scope_);
     if (op) {
       RunAndConstructOpFuncNode(
-          op, *src_var_name, *new_var_name, op_func_nodes);
+          op, *src_var_name, *new_var_name, op_func_nodes, skip_run);
     }
     // update src_var_name
     src_var_name = new_var_name;
@@ -79,7 +80,7 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
         *src_var_name, new_var_name, src_place, dst_place, var_scope_, scope_);
     if (op) {
       RunAndConstructOpFuncNode(
-          op, *src_var_name, *new_var_name, op_func_nodes);
+          op, *src_var_name, *new_var_name, op_func_nodes, skip_run);
     }
     is_transferred = true;
   }
@@ -89,7 +90,8 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
 void DataTranferHelper::RunAndConstructShareNode(
     const std::string& src_var_name,
     const std::string& dst_var_name,
-    std::vector<OpFuncNode>* op_func_nodes) {
+    std::vector<OpFuncNode>* op_func_nodes,
+    bool skip_run) {
   VariableNameMap in_name_map = {{"X", {src_var_name}}};
   VariableNameMap out_name_map = {{"Out", {dst_var_name}}};
   AttributeMap attr_map;
@@ -102,14 +104,16 @@ void DataTranferHelper::RunAndConstructShareNode(
   VLOG(3) << string::Sprintf(
       "Insert %s with %s -> %s.", op_type, src_var_name, dst_var_name);
-  RunAndConstructOpFuncNode(op, src_var_name, dst_var_name, op_func_nodes);
+  RunAndConstructOpFuncNode(
+      op, src_var_name, dst_var_name, op_func_nodes, skip_run);
 }
 
 void DataTranferHelper::RunAndConstructOpFuncNode(
     const std::shared_ptr<OperatorBase>& op,
     const std::string& var_name,
     const std::string& new_var_name,
-    std::vector<OpFuncNode>* new_op_func_nodes) {
+    std::vector<OpFuncNode>* new_op_func_nodes,
+    bool skip_run) {
   auto& op_type = op->Type();
 
   // 1. Construct RuntimeContext
@@ -172,7 +176,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
     phi::KernelContext phi_kernel_context;
     op_with_kernel->BuildPhiKernelContext(
         runtime_context, dev_ctx, &phi_kernel_context);
-    (*new_op_func_node.phi_kernel_)(&phi_kernel_context);
+    if (!skip_run) {
+      (*new_op_func_node.phi_kernel_)(&phi_kernel_context);
+    } else {
+      FakeInitializeOutputs(new_op_func_node.phi_kernel_,
+                            op_with_kernel->PhiKernelSignature(),
+                            &phi_kernel_context);
+    }
   }
 
   const phi::Place& place = dev_ctx->GetPlace();
@@ -425,7 +435,8 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                         VariableScope* var_scope,
                         OpFuncNode* op_func_node,
                         std::vector<OpFuncNode>* new_op_func_nodes,
-                        bool use_local_scope) {
+                        bool use_local_scope,
+                        bool skip_run) {
   Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
                                        : var_scope->GetMutableScope();
@@ -500,7 +511,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                                     op_base->Type() == "fetch_v2");
         if (op) {
           data_transfer_helper.RunAndConstructOpFuncNode(
-              op, var_name, new_var_name, new_op_func_nodes);
+              op, var_name, new_var_name, new_op_func_nodes, skip_run);
         }
         is_transferred = true;
       } else {
@@ -524,7 +535,8 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
             &new_var_name,
             new_op_func_nodes,
             use_local_scope,
-            op_base->Type() == "fetch_v2");
+            op_base->Type() == "fetch_v2",
+            skip_run);
       }
 
       if (is_transferred) {
@@ -575,7 +587,8 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node,
                                  VariableValueMap* out_vars,
                                  VariableScope* var_scope,
                                  std::vector<OpFuncNode>* op_func_nodes,
-                                 framework::Scope* local_scope) {
+                                 framework::Scope* local_scope,
+                                 bool skip_run) {
   DataTranferHelper data_transfer_helper(place, var_scope, local_scope);
   for (auto& var_name_item : out_names) {
     std::vector<Variable*>& vars = out_vars->at(var_name_item.first);
@@ -651,9 +664,9 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node,
       auto op = TransferDtype(
           var_name, &new_var_name, src_type, dst_type, var_scope, local_scope);
       data_transfer_helper.RunAndConstructOpFuncNode(
-          op, var_name, new_var_name, op_func_nodes);
+          op, var_name, new_var_name, op_func_nodes, skip_run);
       data_transfer_helper.RunAndConstructShareNode(
-          new_var_name, var_name, op_func_nodes);
+          new_var_name, var_name, op_func_nodes, skip_run);
     }
   }
 }
......
@@ -40,16 +40,19 @@ class DataTranferHelper {
              std::string* new_var_name,
              std::vector<OpFuncNode>* new_op_func_nodes,
              bool use_local_scope,
-             bool is_fetch_v2);
+             bool is_fetch_v2,
+             bool skip_run = false);
 
   void RunAndConstructShareNode(const std::string& src_var_name,
                                 const std::string& dst_var_name,
-                                std::vector<OpFuncNode>* op_func_nodes);
+                                std::vector<OpFuncNode>* op_func_nodes,
+                                bool skip_run = false);
 
   void RunAndConstructOpFuncNode(const std::shared_ptr<OperatorBase>& op,
                                  const std::string& var_name,
                                  const std::string& new_var_name,
-                                 std::vector<OpFuncNode>* op_func_nodes);
+                                 std::vector<OpFuncNode>* op_func_nodes,
+                                 bool skip_run = false);
 
  private:
   platform::Place place_;
@@ -64,7 +67,8 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
                         VariableScope* var_scope,
                         OpFuncNode* op_func_node,
                         std::vector<OpFuncNode>* op_func_nodes,
-                        bool use_local_scope = true);
+                        bool use_local_scope = true,
+                        bool skip_run = false);
 
 void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node,
                                  const platform::Place& place,
@@ -72,7 +76,8 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node,
                                  VariableValueMap* out_vars,
                                  VariableScope* var_scope,
                                  std::vector<OpFuncNode>* op_func_nodes,
-                                 framework::Scope* local_scope);
+                                 framework::Scope* local_scope,
+                                 bool skip_run = false);
 
 inline bool need_device_transform(const OpKernelType& kernel_type_for_var,
                                   const OpKernelType& expected_kernel_key) {
......
@@ -38,6 +38,11 @@ PADDLE_DEFINE_EXPORTED_bool(
     false,
     "Log memory stats after each op runs, just used for debug.");
 
+PADDLE_DEFINE_EXPORTED_bool(
+    new_executor_static_build,
+    false,
+    "Build the interpreterCore statically without running.");
+
 DECLARE_bool(use_mkldnn);
 DECLARE_bool(check_nan_inf);
@@ -157,6 +162,33 @@ bool IsMemcpyOp(const Instruction& instr) {
   return IsMemcpyD2H(instr) || IsMemcpyH2D(instr);
 }
 
+bool IsBlockContainsOnlyPhiKernel(const framework::BlockDesc& block) {
+  bool res = true;
+  for (auto& op : block.AllOps()) {
+    auto op_type = op->Type();
+    if (op_type == "feed" || op_type == "fetch_v2") {
+      continue;
+    }
+    auto has_phi_kernel =
+        !phi::KernelFactory::Instance()
+             .SelectKernelMap(phi::TransToPhiKernelName(op_type))
+             .empty();
+    if (!has_phi_kernel) {
+      auto kernel_iter = OperatorWithKernel::AllOpKernels().find(op_type);
+      if (kernel_iter != OperatorWithKernel::AllOpKernels().end()) {
+        VLOG(4) << op_type << " has no phi kernel, but has fluid kernel.";
+        res = false;
+      } else {
+        VLOG(4) << op_type << " has no phi kernel, and no fluid kernel.";
+      }
+    } else {
+      VLOG(4) << op_type << " has phi kernel";
+    }
+  }
+  return res;
+}
+
 void AddFetch(const std::vector<std::string>& fetch_names,
               framework::BlockDesc* block) {
   auto* fetch_holder = block->Var(kFetchVarName);
@@ -476,7 +508,66 @@ void HandleOperatorBase(const platform::Place& place,
   op_func_node->dev_ctx_ = dev_ctx;
 }
 
-void BuildOpFuncList(const platform::Place& place,
+void FakeInitializeOutputs(phi::Kernel* phi_kernel,
+                           phi::KernelSignature* kernel_sig,
+                           phi::KernelContext* phi_kernel_context) {
+  auto output_defs = phi_kernel->args_def().output_defs();
+  auto out_names = kernel_sig->output_names;
+
+  for (size_t i = 0; i < out_names.size(); ++i) {
+    VLOG(4) << out_names[i];
+    // calcute the start and end index of the output tensors
+    size_t start_idx = phi_kernel_context->OutputRangeAt(i).first;
+    size_t end_idx = phi_kernel_context->OutputRangeAt(i).second;
+    for (size_t j = start_idx; j < end_idx; ++j) {
+      auto* out_tensor = phi_kernel_context->MutableOutputAt(j);
+      if (out_tensor == nullptr) {
+        VLOG(4) << "Output" << out_names[i] << " is nullptr";
+        continue;
+      }
+      auto backend = output_defs[j].backend;
+      auto* dev_ctx =
+          &(phi_kernel_context->GetDeviceContext<phi::DeviceContext>());
+
+      if (phi::DenseTensor::classof(out_tensor)) {
+        if (!out_tensor->initialized()) {
+          VLOG(4) << "DenseTensor fake alloc 0 bytes of type "
+                  << out_tensor->dtype() << " on backend " << backend << " "
+                  << out_tensor;
+          if (backend == phi::TransToPhiBackend(dev_ctx->GetPlace())) {
+            dev_ctx->Alloc(out_tensor,
+                           out_tensor->dtype(),
+                           /*requested_size=*/0,
+                           /*pinned=*/false,
+                           /*fake_alloc=*/true);
+          } else {
+            if (backend == phi::Backend::CPU ||
+                backend == phi::Backend::ONEDNN) {
+              dev_ctx->HostAlloc(out_tensor,
+                                 out_tensor->dtype(),
+                                 /*requested_size=*/0,
+                                 /*fake_alloc=*/true);
+            }
+          }
+        }
+      } else if (phi::SparseCooTensor::classof(out_tensor)) {
+        // todo
+        VLOG(4) << "SparseCooTensor";
+      } else if (phi::SparseCsrTensor::classof(out_tensor)) {
+        // todo
+        VLOG(4) << "SparseCsrTensor";
+      } else {
+        PADDLE_THROW(phi::errors::Unimplemented(
+            "Only support "
+            "DenseTensor/SparseCooTensor/SparseCsrTensor "
+            "now"));
+        VLOG(4) << "SparseCooTensor";
+      }
+    }
+  }
+}
+
+bool BuildOpFuncList(const platform::Place& place,
                      const framework::BlockDesc& block,
                      const std::set<std::string>& skip_gc_vars,
                      std::vector<OpFuncNode>* vec_func_list,
@@ -490,6 +581,10 @@ void BuildOpFuncList(const platform::Place& place,
   // Step 1: create all ops for current block.
   CreateAllOps(block, &ops_unique);
 
+  auto skip_run =
+      FLAGS_new_executor_static_build && IsBlockContainsOnlyPhiKernel(block);
+  VLOG(4) << "Static build: " << skip_run;
+
   if (!execution_config.used_for_jit) {
     // If gc is enabled and block size > 1
     const ProgramDesc& main_program = *block.Program();
@@ -676,6 +771,7 @@ void BuildOpFuncList(const platform::Place& place,
           }
         }
       }
+
       VLOG(4) << "if run phi kernel? : " << run_phi_kernel;
       if (!run_phi_kernel) {
         op_with_kernel->ChooseKernel(exec_ctx);
@@ -704,12 +800,14 @@ void BuildOpFuncList(const platform::Place& place,
                          var_scope,
                          &op_func_node,
                          vec_func_list,
-                         use_local_scope);
+                         use_local_scope,
+                         skip_run);
       VLOG(4) << "apply data transform done. ";
       // step 4. infershape, see OperatorWithKernel::RunImpl in operator.cc
       // for why.
       if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
             op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
+        VLOG(4) << "infer shape";
         InterpretercoreInferShapeContext infer_shape_ctx(*op,
                                                          runtime_context);
         // TODO(Aurelius84): In case of control flow ops, they are NOT
@@ -722,11 +820,21 @@ void BuildOpFuncList(const platform::Place& place,
           phi::KernelContext phi_kernel_context;
           op_with_kernel->BuildPhiKernelContext(
               runtime_context, dev_ctx, &phi_kernel_context);
-          (*op_func_node.phi_kernel_)(&phi_kernel_context);
+          if (!skip_run) {
+            (*op_func_node.phi_kernel_)(&phi_kernel_context);
+          } else {
+            FakeInitializeOutputs(op_func_node.phi_kernel_,
+                                  op_with_kernel->PhiKernelSignature(),
+                                  &phi_kernel_context);
+          }
         } else {
           // the place of exec_ctx maybe has changed.
-          op_func_node.kernel_func_(ExecutionContext(
-              *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context));
+          if (!skip_run) {
+            op_func_node.kernel_func_(ExecutionContext(
+                *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context));
+          } else {
+            // TODO(zhiqiu): is it needed to support fluid kernel?
+          }
         }
 
         // post-process grad_op.outputs if need cast complex grad into real
@@ -812,6 +920,7 @@ void BuildOpFuncList(const platform::Place& place,
 
     interpreter::LogDeviceMemoryStats(place);
   }
+  return skip_run;
 }
 
 void LogDeviceMemoryStats(const platform::Place& place) {
......
@@ -82,7 +82,7 @@ bool IsSupportedHeterPlace(const phi::Place& place);
 void AddFetch(const std::vector<std::string>& fetch_names,
               framework::BlockDesc* block);
 
-void BuildOpFuncList(const platform::Place& place,
+bool BuildOpFuncList(const platform::Place& place,
                      const framework::BlockDesc& block,
                      const std::set<std::string>& skip_gc_vars,
                      std::vector<OpFuncNode>* vec_func_list,
@@ -96,6 +96,10 @@ void BuildVariableScope(const framework::BlockDesc& block,
 void LogDeviceMemoryStats(const platform::Place& place);
 
+void FakeInitializeOutputs(phi::Kernel* phi_kernel,
+                           phi::KernelSignature* kernel_sig,
+                           phi::KernelContext* phi_kernel_context);
+
 }  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
......
@@ -188,6 +188,35 @@ interpreter::CostInfo InterpreterCore::DryRun(
   return cost_info;
 }
 
+void InterpreterCore::RunImpl() {
+  // For the program that only run once, it is no need to
+  // create work_queue, so the async_work_queue_ is created
+  // until the second step run.
+  async_work_queue_ = GetWorkQueue();
+
+  // lazy initialization of gc, do not create gc is the program only run once
+  if (!gc_) {
+    gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
+  }
+
+  if (execution_config_.used_for_jit && (sync_op_num_ == 0)) {
+    VLOG(4) << "Tracing Instruction List";
+    TraceInstructionList(vec_instruction_);
+  } else {
+    ExecuteInstructionList(vec_instruction_);
+  }
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (platform::is_npu_place(place_)) {
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  }
+#endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (platform::is_custom_place(place_)) {
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  }
+#endif
+}
+
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
@@ -201,33 +230,9 @@ paddle::framework::FetchList InterpreterCore::Run(
   Prepare(feed_names, feed_tensors, is_build);
 
   if (is_build) {
-    // For the program that only run once, it is no need to
-    // create work_queue, so the async_work_queue_ is created
-    // until the second step run.
-    async_work_queue_ = GetWorkQueue();
-
-    // lazy initialization of gc, do not create gc is the program only run once
-    if (!gc_) {
-      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
-    }
-
-    if (execution_config_.used_for_jit && (sync_op_num_ == 0)) {
-      VLOG(4) << "Tracing Instruction List";
-      TraceInstructionList(vec_instruction_);
-    } else {
-      ExecuteInstructionList(vec_instruction_);
-    }
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (platform::is_npu_place(place_)) {
-      platform::DeviceContextPool::Instance().Get(place_)->Wait();
-    }
-#endif
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
-    if (platform::is_custom_place(place_)) {
-      platform::DeviceContextPool::Instance().Get(place_)->Wait();
-    }
-#endif
+    RunImpl();
   }
 
   if (HasLocalScope()) {
     ClearLoDTensorArrayInLocalScope();
   }
@@ -255,7 +260,7 @@ paddle::framework::FetchList InterpreterCore::Run(
         block_, &var_scope_, HasLocalScope());
 
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
-    paddle::framework::interpreter::BuildOpFuncList(
+    auto skip_run = paddle::framework::interpreter::BuildOpFuncList(
         place_,
         block_,
         execution_config_.skip_gc_vars,
@@ -268,33 +273,12 @@ paddle::framework::FetchList InterpreterCore::Run(
     Convert(&op_func_nodes);
     is_build_ = true;
     UpdateSyncOpNum();
-  } else {
-    // For the program that only run once, it is no need to
-    // create work_queue, so the async_work_queue_ is created
-    // until the second step run.
-    async_work_queue_ = GetWorkQueue();
-
-    // lazy initialization of gc, do not create gc is the program only run once
-    if (!gc_) {
-      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
-    }
-
-    if (execution_config_.used_for_jit && (sync_op_num_ == 0)) {
-      VLOG(4) << "Tracing Instruction List";
-      TraceInstructionList(vec_instruction_);
-    } else {
-      ExecuteInstructionList(vec_instruction_);
-    }
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (platform::is_npu_place(place_)) {
-      platform::DeviceContextPool::Instance().Get(place_)->Wait();
-    }
-#endif
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
-    if (platform::is_custom_place(place_)) {
-      platform::DeviceContextPool::Instance().Get(place_)->Wait();
-    }
-#endif
+    if (skip_run) {
+      VLOG(4) << "RUN impl";
+      RunImpl();
+    }
+  } else {
+    RunImpl();
   }
 
   if (HasLocalScope()) {
@@ -1197,7 +1181,7 @@ void InterpreterCore::Prepare(const std::vector<std::string>& feed_names,
         block_, &var_scope_, HasLocalScope());
     FeedInput();
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
-    paddle::framework::interpreter::BuildOpFuncList(
+    auto skip_run = paddle::framework::interpreter::BuildOpFuncList(
         place_,
         block_,
         execution_config_.skip_gc_vars,
@@ -1210,6 +1194,10 @@ void InterpreterCore::Prepare(const std::vector<std::string>& feed_names,
     Convert(&op_func_nodes);
     UpdateSyncOpNum();
     is_build_ = true;
+    if (skip_run) {
+      VLOG(4) << "RUN impl";
+      RunImpl();
+    }
   }
   // NOTE: Because feed_tensor will be GC after
   // paddle::framework::BuildOpFuncList, so we should
......
@@ -98,6 +98,7 @@ class InterpreterCore {
   void SetFeedVarsInplaceSkip(const std::vector<std::string>& feed_names);
 
   // execution
+  void RunImpl();
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
   void RunInstructionAsync(size_t instr_id);
   void RunInstruction(const Instruction& instr_node);
......
@@ -3028,6 +3028,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
         (i == 0 ? 0 : phi_kernel_context->OutputRangeAt(i - 1).second);
 
     if (it == ctx.outputs.end() || it->second.empty()) {
+      VLOG(4) << "Output " << output_names[i] << " not found";
       // Deal with the case that some outputs are not found or be NULL when run
       // the kernel.
       // For example : the outputs of matmul_grad are dx and dy,
@@ -3073,6 +3074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
                 framework::ToTypeName(var->Type())));
       }
     } else {
+      VLOG(4) << "Output " << output_names[i] << " is nullptr";
       phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
   }
......
@@ -553,6 +553,7 @@ REGISTER_OPERATOR(batch_norm,
                   ops::BatchNormOpInferVarType,
                   ops::BatchNormGradMaker<paddle::framework::OpDesc>,
                   ops::BatchNormGradMaker<paddle::imperative::OpBase>);
+
 REGISTER_OPERATOR(batch_norm_grad,
                   ops::BatchNormGradOp,
                   ops::BatchNormDoubleGradMaker<paddle::framework::OpDesc>,
......
@@ -98,7 +98,8 @@ bool DenseTensor::IsSharedWith(const DenseTensor& b) const {
 
 void* DenseTensor::AllocateFrom(Allocator* allocator,
                                 DataType dtype,
-                                size_t requested_size) {
+                                size_t requested_size,
+                                bool fake_alloc) {
   PADDLE_ENFORCE_NOT_NULL(
       allocator,
       phi::errors::InvalidArgument(
@@ -107,21 +108,28 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
     VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype;
     meta_.dtype = dtype;
   }
-  PADDLE_ENFORCE(
-      valid(),
-      phi::errors::PreconditionNotMet(
-          "The meta data must be valid when call the mutable data function."));
   size_t bytes = numel() * SizeOf(this->dtype());
-  if (requested_size) {
-    PADDLE_ENFORCE_GE(requested_size,
-                      bytes,
-                      phi::errors::InvalidArgument(
-                          "The reserved size %d should be enough to meet the "
-                          "volume required by metadata %d.",
-                          requested_size,
-                          bytes));
-    bytes = requested_size;
+  if (fake_alloc) {
+    bytes = 0;
+  } else {
+    PADDLE_ENFORCE(
+        valid(),
+        phi::errors::PreconditionNotMet("The meta data must be valid when "
+                                        "call the mutable data function."));
+    if (requested_size) {
+      PADDLE_ENFORCE_GE(requested_size,
+                        bytes,
+                        phi::errors::InvalidArgument(
+                            "The reserved size %d should be enough to meet the "
+                            "volume required by metadata %d.",
+                            requested_size,
+                            bytes));
+      bytes = requested_size;
+    }
   }
 
   // NOTE(paddle-dev): In case of the allocator of storage_ is different with
   // the incoming allocator, we will re-alloc data using the incoming
   // allocator. See DeviceContext.Alloc in core/device_context.cc.
......
@@ -125,7 +125,8 @@ class DenseTensor : public TensorBase,
   /// \return The mutable data pointer value of type T.
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 
   /// \brief Check if allocation is shared with other objects.
   /// \return Whether the allocation is shared with other objects.
......
@@ -134,7 +134,8 @@ struct DeviceContext::Impl {
               const Place& place,
               DataType dtype = DataType::UNDEFINED,
               size_t requested_size = 0,
-              bool pinned = false) const {
+              bool pinned = false,
+              bool fake_alloc = false) const {
     PADDLE_ENFORCE_NOT_NULL(
         tensor,
         phi::errors::InvalidArgument(
@@ -148,9 +149,10 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != place) {
       ClearHolder(tensor);
     }
-    auto* allocator = tensor->numel() == 0 && requested_size == 0
-                          ? zero_allocator_
-                          : (pinned ? pinned_allocator_ : device_allocator_);
+    auto* allocator =
+        (tensor->numel() == 0 || fake_alloc) && requested_size == 0
+            ? zero_allocator_
+            : (pinned ? pinned_allocator_ : device_allocator_);
 #ifdef PADDLE_WITH_CUDA
     bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned;
     if (must_cuda_graph_allocator &&
@@ -164,7 +166,7 @@ struct DeviceContext::Impl {
     }
 #endif
     return tensor->AllocateFrom(
-        const_cast<Allocator*>(allocator), dtype, requested_size);
+        const_cast<Allocator*>(allocator), dtype, requested_size, fake_alloc);
   }
 
   template <typename T>
@@ -178,7 +180,8 @@ struct DeviceContext::Impl {
   void* HostAlloc(TensorBase* tensor,
                   DataType dtype = DataType::UNDEFINED,
-                  size_t requested_size = 0) const {
+                  size_t requested_size = 0,
+                  bool fake_alloc = false) const {
     PADDLE_ENFORCE_NOT_NULL(
         tensor,
         phi::errors::InvalidArgument(
@@ -190,9 +193,11 @@ struct DeviceContext::Impl {
       ClearHolder(tensor);
     }
     auto* allocator =
-        tensor->numel() == 0 ? host_zero_allocator_ : host_allocator_;
+        (tensor->numel() == 0 || fake_alloc) && requested_size == 0
+            ? host_zero_allocator_
+            : host_allocator_;
     return tensor->AllocateFrom(
-        const_cast<Allocator*>(allocator), dtype, requested_size);
+        const_cast<Allocator*>(allocator), dtype, requested_size, fake_alloc);
   }
 
   template <typename T>
@@ -342,12 +347,18 @@ const Allocator& DeviceContext::GetPinnedAllocator() const {
 void* DeviceContext::Alloc(TensorBase* tensor,
                            DataType dtype,
                            size_t requested_size,
-                           bool pinned) const {
+                           bool pinned,
+                           bool fake_alloc) const {
   if (pinned) {
-    return impl_->Alloc(
-        tensor, GetPinnedPlace(GetPlace()), dtype, requested_size, pinned);
+    return impl_->Alloc(tensor,
+                        GetPinnedPlace(GetPlace()),
+                        dtype,
+                        requested_size,
+                        pinned,
+                        fake_alloc);
   }
-  return impl_->Alloc(tensor, GetPlace(), dtype, requested_size, pinned);
+  return impl_->Alloc(
+      tensor, GetPlace(), dtype, requested_size, pinned, fake_alloc);
 }
 
 template <typename T>
@@ -363,8 +374,9 @@ T* DeviceContext::Alloc(TensorBase* tensor,
 
 void* DeviceContext::HostAlloc(TensorBase* tensor,
                                DataType dtype,
-                               size_t requested_size) const {
-  return impl_->HostAlloc(tensor, dtype, requested_size);
+                               size_t requested_size,
+                               bool fake_alloc) const {
+  return impl_->HostAlloc(tensor, dtype, requested_size, fake_alloc);
 }
 
 template <typename T>
......
@@ -149,7 +149,8 @@ class PADDLE_API DeviceContext {
   void* Alloc(TensorBase*,
               DataType dtype,
               size_t requested_size = 0,
-              bool pinned = false) const;
+              bool pinned = false,
+              bool fake_alloc = false) const;
 
   template <typename T>
   T* Alloc(TensorBase* tensor,
@@ -161,7 +162,8 @@ class PADDLE_API DeviceContext {
    */
   void* HostAlloc(TensorBase* tensor,
                   DataType dtype,
-                  size_t requested_size = 0) const;
+                  size_t requested_size = 0,
+                  bool fake_alloc = false) const;
 
   template <typename T>
   T* HostAlloc(TensorBase* tensor, size_t requested_size = 0) const;
......
@@ -53,7 +53,8 @@ bool ExtendedTensor::initialized() const {
 
 void* ExtendedTensor::AllocateFrom(Allocator* allocator,
                                    DataType dtype,
-                                   size_t requested_size) {
+                                   size_t requested_size,
+                                   bool fake_alloc) {
   PADDLE_THROW(phi::errors::Unavailable(
       "ExtendedTensor does not support `AllocateFrom` method."));
 }
......
@@ -49,7 +49,8 @@ class ExtendedTensor : public TensorBase {
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 };
 
 }  // namespace phi
......
@@ -119,6 +119,8 @@ class KernelContext {
     return static_cast<TensorType*>(outputs_.at(idx));
   }
 
+  TensorBase* MutableOutputAt(size_t idx) { return outputs_.at(idx); }
+
   template <typename TensorType>
   std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) {
     std::vector<TensorType*> v;
......
@@ -90,8 +90,9 @@ class SelectedRows : public TensorBase,
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override {
-    return impl_->AllocateFrom(allocator, dtype, requested_size);
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override {
+    return impl_->AllocateFrom(allocator, dtype, requested_size, fake_alloc);
   }
 
   /*
......
@@ -94,8 +94,9 @@ struct TensorFillVisitor {
 
 void* SelectedRowsImpl::AllocateFrom(Allocator* allocator,
                                      DataType dtype,
-                                     size_t requested_size) {
-  return value_->AllocateFrom(allocator, dtype, requested_size);
+                                     size_t requested_size,
+                                     bool fake_alloc) {
+  return value_->AllocateFrom(allocator, dtype, requested_size, fake_alloc);
 }
 
 bool SelectedRowsImpl::HasKey(int64_t key) const {
......
@@ -109,7 +109,8 @@ class SelectedRowsImpl {
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0);
+                     size_t requested_size = 0,
+                     bool fake_alloc = false);
 
   /*
    * @brief Get the index of the key from id_to_index_ map. If the key not
......
@@ -67,8 +67,10 @@ SparseCooTensor SparseCooTensor::operator=(const SparseCooTensor& other) {
 
 void* SparseCooTensor::AllocateFrom(Allocator* allocator,
                                     DataType dtype,
-                                    size_t requested_size) {
-  return non_zero_elements_.AllocateFrom(allocator, dtype, requested_size);
+                                    size_t requested_size,
+                                    bool fake_alloc) {
+  return non_zero_elements_.AllocateFrom(
+      allocator, dtype, requested_size, fake_alloc);
 }
 
 int64_t SparseCooTensor::nnz() const {
......
@@ -170,7 +170,8 @@ class SparseCooTensor : public TensorBase,
   /// \brief This function is not recommended
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 
   /// \brief get the sparse dim
   int32_t sparse_dim() const;
......
@@ -82,8 +82,10 @@ SparseCsrTensor& SparseCsrTensor::operator=(const SparseCsrTensor& other) {
 
 void* SparseCsrTensor::AllocateFrom(Allocator* allocator,
                                     DataType dtype,
-                                    size_t requested_size) {
-  return non_zero_elements_.AllocateFrom(allocator, dtype, requested_size);
+                                    size_t requested_size,
+                                    bool fake_alloc) {
+  return non_zero_elements_.AllocateFrom(
+      allocator, dtype, requested_size, fake_alloc);
 }
 
 void SparseCsrTensor::Resize(const DDim& dense_dims,
......
@@ -62,7 +62,8 @@ class SparseCsrTensor : public TensorBase,
   /// \brief This function is not recommended
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 
  public:
   /// \brief Returns the name of the class for type traits.
......
@@ -130,25 +130,32 @@ void StringTensor::init_holder() {
 void* StringTensor::AllocateFrom(Allocator* allocator,
                                  DataType dtype,
-                                 size_t requested_size) {
+                                 size_t requested_size,
+                                 bool fake_alloc) {
   PADDLE_ENFORCE_NOT_NULL(
       allocator,
       errors::InvalidArgument(
           "Required allocator shall not be nullptr, but received nullptr."));
-  PADDLE_ENFORCE(
-      valid(),
-      errors::PreconditionNotMet(
-          "The meta data must be valid when call the mutable data function."));
   size_t bytes = numel() * SizeOf(this->dtype());
-  if (requested_size) {
-    PADDLE_ENFORCE_GE(requested_size,
-                      bytes,
-                      errors::InvalidArgument(
-                          "The reserved size %d should be enough to meet the "
-                          "volume required by metadata %d.",
-                          requested_size,
-                          bytes));
-    bytes = requested_size;
+  if (fake_alloc) {
+    bytes = 0;
+  } else {
+    PADDLE_ENFORCE(
+        valid(),
+        errors::PreconditionNotMet("The meta data must be valid when call the "
+                                   "mutable data function."));
+    if (requested_size) {
+      PADDLE_ENFORCE_GE(requested_size,
+                        bytes,
+                        errors::InvalidArgument(
+                            "The reserved size %d should be enough to meet the "
+                            "volume required by metadata %d.",
+                            requested_size,
+                            bytes));
+      bytes = requested_size;
+    }
   }
   if (!holder_ || holder_->size() < bytes + meta_.offset) {
......
@@ -123,7 +123,8 @@ class StringTensor : public TensorBase,
   }
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 
   dtype::pstring* mutable_data(const phi::Place& place,
                                size_t requested_size = 0);
......
@@ -65,9 +65,11 @@ bool TensorArray::valid() const {
 /// \return Void pointer
 void* TensorArray::AllocateFrom(Allocator* allocator,
                                 DataType dtype,
-                                size_t requested_size) {
+                                size_t requested_size,
+                                bool fake_allc) {
   for (size_t i = 0; i < tensors_.size(); i++) {
-    tensors_[i].AllocateFrom(allocator, tensors_[i].dtype(), requested_size);
+    tensors_[i].AllocateFrom(
+        allocator, tensors_[i].dtype(), requested_size, fake_allc);
   }
   return nullptr;
 }
......
@@ -83,7 +83,8 @@ class TensorArray : public TensorBase,
   /// \return Void pointer
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
-                     size_t requested_size = 0) override;
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
 
   bool empty() const { return tensors_.empty(); }
......
@@ -66,7 +66,8 @@ class TensorBase {
   /// \return The mutable data pointer value of type T.
   virtual void* AllocateFrom(Allocator* allocator,
                              DataType dtype,
-                             size_t requested_size = 0) = 0;
+                             size_t requested_size = 0,
+                             bool fake_alloc = false) = 0;
 
   /// \brief Return the type information of the derived class to support
   /// safely downcast in non-rtti environment.
......
@@ -25,3 +25,5 @@ py_test_modules(
   FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat)
 set_tests_properties(test_standalone_cross_step_overlap PROPERTIES TIMEOUT 30)
+set_tests_properties(test_standalone_executor_aot_choose_kernel
+                     PROPERTIES TIMEOUT 60)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
from paddle.framework import set_flags

paddle.enable_static()


def build_resnet50():
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        image = paddle.static.data(
            name='image', shape=[32, 3, 224, 224], dtype='float32'
        )
        label = paddle.static.data(name='label', shape=[32], dtype='int64')
        model = paddle.vision.models.resnet50()
        prediction = model(image)
        loss = paddle.nn.functional.cross_entropy(input=prediction, label=label)
        loss = paddle.mean(loss)
        adam = paddle.optimizer.Adam(learning_rate=0.001)
        adam.minimize(loss)
    return main_program, startup_program, loss


class TestAOTChooseKernel(unittest.TestCase):
    def test_aot_choose_kernel(self):
        if not paddle.fluid.core.is_compiled_with_cuda():
            return

        def run(aot_choose_kernel=None):
            paddle.seed(2022)
            np.random.seed(2022)

            main_program, startup_program, loss = build_resnet50()

            scope = paddle.static.Scope()
            exe = paddle.static.Executor()

            set_flags({'FLAGS_cudnn_deterministic': 1})
            if aot_choose_kernel:
                set_flags({'FLAGS_new_executor_static_build': 1})
            else:
                set_flags({'FLAGS_new_executor_static_build': 0})

            with paddle.static.scope_guard(scope):
                exe.run(startup_program)

                for i in range(10):
                    feed = {
                        'image': np.random.randint(
                            0, 256, size=[32, 3, 224, 224]
                        ).astype('float32'),
                        'label': np.random.randint(0, 1000, size=[32]).astype(
                            'int64'
                        ),
                    }
                    loss_ = exe.run(main_program, feed=feed, fetch_list=[loss])
            return loss_

        loss1 = run(aot_choose_kernel=True)
        loss2 = run(aot_choose_kernel=False)

        self.assertEqual(loss1, loss2)


if __name__ == "__main__":
    unittest.main()