Unverified commit b39afb13, authored by csy0225, committed by GitHub

Optimize the ernie inference performance on xpu backend. (#50357)

* Optimize the ernie inference performance on xpu

* fix enable runtime cache logic

* when op's input shape has changed, should create a new runtime context

* fix

* set flag when input shape has changed
Parent 1e7dc9c0
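For context, here is a minimal sketch of the user-facing side this change targets: running ernie-style inference on the XPU backend through the Paddle Inference C++ API. The include path, model file names, input shape, and the EnableXpu argument below are illustrative assumptions and may differ across Paddle versions.

#include <cstdint>
#include <vector>
#include "paddle_inference_api.h"  // assumed include path of the inference SDK

int main() {
  paddle_infer::Config config;
  // Hypothetical model files; replace with the real ernie model paths.
  config.SetModel("ernie/model.pdmodel", "ernie/model.pdiparams");
  // Run on the XPU backend; the L3 workspace size here is an assumed value.
  config.EnableXpu(/*l3_workspace_size=*/16 * 1024 * 1024);

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed one dummy token-id tensor; a real ernie model has several inputs.
  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputHandle(input_names[0]);
  std::vector<int64_t> ids(1 * 128, 1);
  input->Reshape({1, 128});
  input->CopyFromCpu(ids.data());

  predictor->Run();
  return 0;
}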
@@ -1618,6 +1618,57 @@ void OperatorWithKernel::CheckWhetherPreparePhiData(
}
}
// When do we need to reset the runtime context?
// 1. When cache_runtime_context is enabled and the program runs for the first
//    time, runtime_ctx_.get() == nullptr, so a new runtime context must be
//    created.
// 2. When cache_runtime_context is enabled and the program is not running for
//    the first time, but the input shape or tensor layout of the operator has
//    changed, the runtime context stored in the cache cannot be reused and a
//    new one must be created.
bool OperatorWithKernel::NeedResetRuntimeContext(const Scope& scope) const {
if (runtime_ctx_.get() == nullptr) return true;
const auto& name_map = Inputs();
for (auto& var_name_item : name_map) {
auto& name_vec = var_name_item.second;
std::vector<Variable*>& cache_input_vars =
runtime_ctx_->inputs[var_name_item.first];
PADDLE_ENFORCE_EQ(
name_vec.size(),
cache_input_vars.size(),
platform::errors::InvalidArgument(
"The size of input variable names (%d) must be equal to "
"the size of cache input variable ptrs (%d).",
name_vec.size(),
cache_input_vars.size()));
for (size_t i = 0; i < name_vec.size(); i++) {
auto var_name = name_vec[i];
auto* cache_input_var = cache_input_vars[i];
if (!VarIsTensor(*cache_input_var)) continue;
auto* cache_input_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(cache_input_var);
auto cache_input_tensor_dims = cache_input_tensor->dims();
auto* current_input_var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
current_input_var,
platform::errors::NotFound(
"The variable %s is not found when "
"enable_cache_runtime_context_cache in origin scope.",
var_name));
auto* current_input_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(current_input_var);
auto current_input_tensor_dims = current_input_tensor->dims();
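// A mismatch in dims or layout means the cached RuntimeContext (and any
// previously prepared inputs) is stale, so force data preparation and
// rebuild the context.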
if (cache_input_tensor_dims != current_input_tensor_dims ||
NeedTransformLayout(current_input_tensor->layout(),
cache_input_tensor->layout())) {
need_prepare_data_ = true;
return true;
}
}
}
return false;
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1627,12 +1678,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_ &&
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
@@ -1640,12 +1689,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
if (NeedResetRuntimeContext(scope)) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
}
RunImpl(scope, place, runtime_ctx_.get());
}
@@ -2030,8 +2076,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// To solve issue #15032: after a discussion with @Luotao, we do not cache the
// transfer scope for cpu inference; hence, in this case, the transfer scope is
// deleted after the run to avoid a memory leak.
if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
scope.DeleteScope(transfer_scope);
if (cache_transfer_scope_ && !run_by_executor_ &&
!enable_cache_transfer_scope_) {
scope.DeleteScope(cache_transfer_scope_);
}
}
@@ -2566,33 +2613,25 @@ Scope* OperatorWithKernel::PrepareData(
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
new_expected_kernel_key->backend() == phi::Backend::GPU ||
new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, *new_expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
} else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
expected_kernel_key.backend() == phi::Backend::GPU ||
expected_kernel_key.backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
}
if (!new_scope) {
new_scope = &scope.NewScope();
}
// For inference, if a GPU model has an op that can only run on the CPU,
// the result for every input would be identical to the first one.
// The reason is that when a GPU tensor is the input of a CPU kernel,
// we create a new CPU tensor in the new scope.
// However, with enable_cache_runtime_context_, we would keep fetching that
// cached CPU tensor instead of the GPU tensor. Thus, we set
// pre_scope_ = nullptr to trigger `new RuntimeContext()` in RunImpl().
if (enable_cache_runtime_context_) {
pre_scope_ = nullptr;
}
// Create new var with the same name in transfer scopes
auto* trans_var = new_scope->Var(var_name);
@@ -2678,18 +2717,13 @@ Scope* OperatorWithKernel::PrepareData(
}
}
// If pre_scope_ == &scope, the scope is cached and the op is not inside a
// while block. If new_scope == nullptr, none of this op's inputs need
// PrepareData, so PrepareData can be skipped in the remaining iterations to
// save elapsed time.
// Skipping PrepareData inside a while block is not supported, because the
// op's input may be changed by subsequent ops, which may cause an error.
// For inference, ops behind a conditional branch are not supported well,
// so the prepare optimization is disabled conservatively.
bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
Attr<bool>("inference_force_prepare_data");
if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
if (enable_cache_runtime_context_ && !force_prepare_data) {
need_prepare_data_ = false;
}
@@ -781,18 +781,19 @@ class OperatorWithKernel : public OperatorBase {
// used for IndicateOrPromoteVarDataTypes
phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
const std::string& name) const;
bool NeedResetRuntimeContext(const Scope& scope) const;
protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr;
mutable bool need_prepare_data_ = true;
mutable bool need_prepare_phi_data_ = false;
mutable bool enable_cache_runtime_context_ = false;
mutable bool all_kernels_must_compute_runtime_shape_ = false;
mutable std::mutex cache_update_mutex_;
mutable bool enable_cache_transfer_scope_ = false;
mutable Scope* cache_transfer_scope_ = nullptr;
// NOTE(jiahongyu): Whether fallback to plain kernel after calling
// GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard
// code
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
}
#endif
#ifdef PADDLE_WITH_XPU
void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
if (!argument->use_xpu()) return;
PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The xpu_device_id field should be valid"));
LOG(INFO) << "Sync params from CPU to XPU: "
<< "xpu_device_id - " << argument->xpu_device_id();
platform::Place place = platform::XPUPlace(argument->xpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<phi::DenseTensor>()) {
auto *t = var->GetMutable<phi::DenseTensor>();
platform::CPUPlace cpu_place;
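// Stage the parameter in a temporary CPU tensor, release the original
// allocation, then copy the data to the XPU place so the weight ends up
// in XPU device memory.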
phi::DenseTensor temp_tensor;
temp_tensor.Resize(t->dims());
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
#endif
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(
argument->scope_valid(),
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
if (argument->use_custom_device_valid()) {
CopyParamsToCustomDevice(argument);
}
#endif
#ifdef PADDLE_WITH_XPU
if (argument->use_xpu_valid()) {
CopyParamsToXpu(argument);
}
#endif
paddle::memory::Release(platform::CPUPlace());
}
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void CopyParamsToCustomDevice(Argument *argument);
#endif
#ifdef PADDLE_WITH_XPU
void CopyParamsToXpu(Argument *argument);
#endif
};
} // namespace analysis
@@ -1281,6 +1281,18 @@ void AnalysisPredictor::PrepareArgument() {
}
#endif
#ifdef PADDLE_WITH_XPU
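// Forward the user-specified XPU options from AnalysisConfig to the analysis
// Argument, so passes such as IrParamsSyncAmongDevicesPass can read
// use_xpu()/xpu_device_id() when copying parameters to the XPU device.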
argument_->SetUseXpu(config_.use_xpu_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
argument_->SetXpuLocked(config_.xpu_locked_);
argument_->SetXpuAutotune(config_.xpu_autotune_);
argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
argument_->SetXpuPrecision(config_.xpu_precision_);
argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_->SetXpuDeviceId(config_.xpu_device_id_);
argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
#endif
auto *pass_builder = config_.pass_builder();
// TODO(inference): Need to reconstruct the pass_builder, pass should be
// processed in a single