Unverified commit b39afb13, authored by csy0225, committed by GitHub

Optimize the ernie inference performance on xpu backend. (#50357)

* Optimize the ernie inference performance on xpu

* fix enable runtime cache logic

* when op's input shape has changed, should create a new runtime context

* fix

* set flag when input shape has changed
Parent 1e7dc9c0
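For context, here is a minimal sketch of the user-facing side this change targets: running ernie-style inference on the XPU backend through the Paddle Inference C++ API. The include path, model file names, input shape, and the EnableXpu argument below are illustrative assumptions and may differ across Paddle versions.

#include <cstdint>
#include <vector>
#include "paddle_inference_api.h"  // assumed include path of the inference SDK

int main() {
  paddle_infer::Config config;
  // Hypothetical model files; replace with the real ernie model paths.
  config.SetModel("ernie/model.pdmodel", "ernie/model.pdiparams");
  // Run on the XPU backend; the L3 workspace size here is an assumed value.
  config.EnableXpu(/*l3_workspace_size=*/16 * 1024 * 1024);

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed one dummy token-id tensor; a real ernie model has several inputs.
  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputHandle(input_names[0]);
  std::vector<int64_t> ids(1 * 128, 1);
  input->Reshape({1, 128});
  input->CopyFromCpu(ids.data());

  predictor->Run();
  return 0;
}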
@@ -1618,6 +1618,57 @@ void OperatorWithKernel::CheckWhetherPreparePhiData(
}
}
// When do we need to reset the runtime context?
// 1. When cache_runtime_context is enabled and the program runs for the first
//    time, runtime_ctx_.get() == nullptr, so a new runtime context must be
//    created.
// 2. When cache_runtime_context is enabled and the program is not running for
//    the first time, but the input shape or tensor layout of the operator has
//    changed, the runtime context stored in the cache cannot be reused and a
//    new one must be created.
bool OperatorWithKernel::NeedResetRuntimeContext(const Scope& scope) const {
if (runtime_ctx_.get() == nullptr) return true;
const auto& name_map = Inputs();
for (auto& var_name_item : name_map) {
auto& name_vec = var_name_item.second;
std::vector<Variable*>& cache_input_vars =
runtime_ctx_->inputs[var_name_item.first];
PADDLE_ENFORCE_EQ(
name_vec.size(),
cache_input_vars.size(),
platform::errors::InvalidArgument(
"The size of input variable names (%d) must be equal to "
"the size of cache input variable ptrs (%d).",
name_vec.size(),
cache_input_vars.size()));
for (size_t i = 0; i < name_vec.size(); i++) {
auto var_name = name_vec[i];
auto* cache_input_var = cache_input_vars[i];
if (!VarIsTensor(*cache_input_var)) continue;
auto* cache_input_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(cache_input_var);
auto cache_input_tensor_dims = cache_input_tensor->dims();
auto* current_input_var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
current_input_var,
platform::errors::NotFound(
"The variable %s is not found when "
"enable_cache_runtime_context_cache in origin scope.",
var_name));
auto* current_input_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(current_input_var);
auto current_input_tensor_dims = current_input_tensor->dims();
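// A mismatch in dims or layout means the cached RuntimeContext (and any
// previously prepared inputs) is stale, so force data preparation and
// rebuild the context.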
if (cache_input_tensor_dims != current_input_tensor_dims ||
NeedTransformLayout(current_input_tensor->layout(),
cache_input_tensor->layout())) {
need_prepare_data_ = true;
return true;
}
}
}
return false;
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1627,12 +1678,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_ &&
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
@@ -1640,12 +1689,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
if (NeedResetRuntimeContext(scope)) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
}
RunImpl(scope, place, runtime_ctx_.get());
}
@@ -2030,8 +2076,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// To solve issue #15032: after a discussion with @Luotao, we do not cache the
// transfer scope for cpu inference; hence, in this case, the transfer scope is
// deleted after the run to avoid a memory leak.
if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
scope.DeleteScope(transfer_scope);
if (cache_transfer_scope_ && !run_by_executor_ &&
!enable_cache_transfer_scope_) {
scope.DeleteScope(cache_transfer_scope_);
}
}
@@ -2566,33 +2613,25 @@ Scope* OperatorWithKernel::PrepareData(
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
new_expected_kernel_key->backend() == phi::Backend::GPU ||
new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, *new_expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
} else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
expected_kernel_key.backend() == phi::Backend::GPU ||
expected_kernel_key.backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
}
if (!new_scope) {
new_scope = &scope.NewScope();
}
// For inference, if a GPU model has an op that can only run on the CPU,
// the result for every input would be identical to the first one.
// The reason is that when a GPU tensor is the input of a CPU kernel,
// we create a new CPU tensor in the new scope.
// However, with enable_cache_runtime_context_, we would keep fetching that
// cached CPU tensor instead of the GPU tensor. Thus, we set
// pre_scope_ = nullptr to trigger `new RuntimeContext()` in RunImpl().
if (enable_cache_runtime_context_) {
pre_scope_ = nullptr;
}
// Create new var with the same name in transfer scopes
auto* trans_var = new_scope->Var(var_name);
@@ -2678,18 +2717,13 @@ Scope* OperatorWithKernel::PrepareData(
}
}
// If pre_scope_ == &scope, the scope is cached and the op is not inside a
// while block. If new_scope == nullptr, none of this op's inputs need
// PrepareData, so PrepareData can be skipped in the remaining iterations to
// save elapsed time.
// Skipping PrepareData inside a while block is not supported, because the
// op's input may be changed by subsequent ops, which may cause an error.
// For inference, ops behind a conditional branch are not supported well,
// so the prepare optimization is disabled conservatively.
bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
Attr<bool>("inference_force_prepare_data");
if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
if (enable_cache_runtime_context_ && !force_prepare_data) {
need_prepare_data_ = false;
}
@@ -781,18 +781,19 @@ class OperatorWithKernel : public OperatorBase {
// used for IndicateOrPromoteVarDataTypes
phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
const std::string& name) const;
bool NeedResetRuntimeContext(const Scope& scope) const;
protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr;
mutable bool need_prepare_data_ = true;
mutable bool need_prepare_phi_data_ = false;
mutable bool enable_cache_runtime_context_ = false;
mutable bool all_kernels_must_compute_runtime_shape_ = false;
mutable std::mutex cache_update_mutex_;
mutable bool enable_cache_transfer_scope_ = false;
mutable Scope* cache_transfer_scope_ = nullptr;
// NOTE(jiahongyu): Whether fallback to plain kernel after calling
// GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard
// code
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
}
#endif
#ifdef PADDLE_WITH_XPU
void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
if (!argument->use_xpu()) return;
PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The xpu_device_id field should be valid"));
LOG(INFO) << "Sync params from CPU to XPU: "
<< "xpu_device_id - " << argument->xpu_device_id();
platform::Place place = platform::XPUPlace(argument->xpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<phi::DenseTensor>()) {
auto *t = var->GetMutable<phi::DenseTensor>();
platform::CPUPlace cpu_place;
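// Stage the parameter in a temporary CPU tensor, release the original
// allocation, then copy the data to the XPU place so the weight ends up
// in XPU device memory.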
phi::DenseTensor temp_tensor;
temp_tensor.Resize(t->dims());
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
#endif
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(
argument->scope_valid(),
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
if (argument->use_custom_device_valid()) {
CopyParamsToCustomDevice(argument);
}
#endif
#ifdef PADDLE_WITH_XPU
if (argument->use_xpu_valid()) {
CopyParamsToXpu(argument);
}
#endif
paddle::memory::Release(platform::CPUPlace());
}
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void CopyParamsToCustomDevice(Argument *argument);
#endif
#ifdef PADDLE_WITH_XPU
void CopyParamsToXpu(Argument *argument);
#endif
};
} // namespace analysis
@@ -1281,6 +1281,18 @@ void AnalysisPredictor::PrepareArgument() {
}
#endif
#ifdef PADDLE_WITH_XPU
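// Forward the user-specified XPU options from AnalysisConfig to the analysis
// Argument, so passes such as IrParamsSyncAmongDevicesPass can read
// use_xpu()/xpu_device_id() when copying parameters to the XPU device.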
argument_->SetUseXpu(config_.use_xpu_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
argument_->SetXpuLocked(config_.xpu_locked_);
argument_->SetXpuAutotune(config_.xpu_autotune_);
argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
argument_->SetXpuPrecision(config_.xpu_precision_);
argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_->SetXpuDeviceId(config_.xpu_device_id_);
argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
#endif
auto *pass_builder = config_.pass_builder();
// TODO(inference): Need to reconstruct the pass_builder, pass should be
// processed in a single