diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 747a17bd05e08d2362acabccd0efc6372d40e36b..5edb49345f2c6015e829b06b881d852c21a4e417 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1618,57 +1618,6 @@ void OperatorWithKernel::CheckWhetherPreparePhiData(
   }
 }
 
-// When do we need to reset runtime context?
-// 1. When enable cache runtime context, if the program runs for the first time,
-// runtime_ctx_.get() == nullptr, we need to create a new runtime context.
-// 2. When enable cache runtime context, if the program is not running for the
-// first time,
-// but the input shape or tensor layout of the operator has changed, we cannot
-// use the runtime context stored in the cache at this time, and need to
-// create a new one.
-bool OperatorWithKernel::NeedResetRuntimeContext(const Scope& scope) const {
-  if (runtime_ctx_.get() == nullptr) return true;
-  const auto& name_map = Inputs();
-  for (auto& var_name_item : name_map) {
-    auto& name_vec = var_name_item.second;
-    std::vector<Variable*>& cache_input_vars =
-        runtime_ctx_->inputs[var_name_item.first];
-    PADDLE_ENFORCE_EQ(
-        name_vec.size(),
-        cache_input_vars.size(),
-        platform::errors::InvalidArgument(
-            "The size of input variable names (%d) must be equal to "
-            "the size of cache input variable ptrs (%d).",
-            name_vec.size(),
-            cache_input_vars.size()));
-    for (size_t i = 0; i < name_vec.size(); i++) {
-      auto var_name = name_vec[i];
-      auto* cache_input_var = cache_input_vars[i];
-      if (!VarIsTensor(*cache_input_var)) continue;
-      auto* cache_input_tensor =
-          GetMutableLoDTensorOrSelectedRowsValueFromVar(cache_input_var);
-      auto cache_input_tensor_dims = cache_input_tensor->dims();
-      auto* current_input_var = scope.FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(
-          current_input_var,
-          platform::errors::NotFound(
-              "The variable %s is not found when "
-              "enable_cache_runtime_context_cache in origin scope.",
-              var_name));
-      auto* current_input_tensor =
-          GetMutableLoDTensorOrSelectedRowsValueFromVar(current_input_var);
-      auto current_input_tensor_dims = current_input_tensor->dims();
-      if (cache_input_tensor_dims != current_input_tensor_dims ||
-          NeedTransformLayout(current_input_tensor->layout(),
-                              cache_input_tensor->layout())) {
-        need_prepare_data_ = true;
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1678,6 +1627,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   if (!all_kernels_must_compute_runtime_shape_ &&
       HasAttr(kAllKernelsMustComputeRuntimeShape))
     all_kernels_must_compute_runtime_shape_ = true;
+  const Scope* cur_scope = &scope;
   CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
   if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
@@ -1689,9 +1639,12 @@
     }
     (*phi_kernel_)(impl_->getKernelContext());
   } else {
-    if (NeedResetRuntimeContext(scope)) {
+    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
       std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+        pre_scope_ = cur_scope;
+      }
     }
     RunImpl(scope, place, runtime_ctx_.get());
   }
@@ -2086,9 +2039,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // To solve issue #15032, have a discussion with @Luotao for cpu inference,
   // do not cache transfer scope, hence in this case delete transfer scope
   // after run to avoid memory leak
-  if (cache_transfer_scope_ && !run_by_executor_ &&
-      !enable_cache_transfer_scope_) {
-    scope.DeleteScope(cache_transfer_scope_);
+  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
+    scope.DeleteScope(transfer_scope);
   }
 }
 
@@ -2623,25 +2575,33 @@ Scope* OperatorWithKernel::PrepareData(
               kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
               new_expected_kernel_key->backend() == phi::Backend::GPU ||
               new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
-            cache_transfer_scope_ = TryCreateTransferScope(
+            new_scope = TryCreateTransferScope(
                 kernel_type_for_var, *new_expected_kernel_key, &scope);
             enable_cache_transfer_scope_ = true;
-            new_scope = cache_transfer_scope_;
           }
         } else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
                    kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
                    expected_kernel_key.backend() == phi::Backend::GPU ||
                    expected_kernel_key.backend() == phi::Backend::GPUDNN) {
-          cache_transfer_scope_ = TryCreateTransferScope(
+          new_scope = TryCreateTransferScope(
               kernel_type_for_var, expected_kernel_key, &scope);
           enable_cache_transfer_scope_ = true;
-          new_scope = cache_transfer_scope_;
         }
       }
 
       if (!new_scope) {
         new_scope = &scope.NewScope();
       }
+      // For inference, if a gpu model has an op which could only run on CPU,
+      // each result of different input will be the same with the first one.
+      // The reason is that if a gpu tensor is the input of a cpu kernel,
+      // we will create a new cpu tensor in new scope.
+      // However, if enable_cache_runtime_context_, we get the cpu tensor each
+      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
+      // to trigger `new RuntimeContext()` in RunImpl().
+      if (enable_cache_runtime_context_) {
+        pre_scope_ = nullptr;
+      }
 
       // Create new var with the same name in transfer scopes
       auto* trans_var = new_scope->Var(var_name);
@@ -2727,13 +2687,18 @@ Scope* OperatorWithKernel::PrepareData(
     }
   }
+  // If pre_scope = &scope, it means that scope is cached and the op is not in
+  // while block. If new_scope = nullptr, it means that for each input of this
+  // Op, there is no need to do PrepareData. So PrepareData could be skipped at
+  // the rest iterations to save the elapsed time.
   // We do not support skipping PrepareData in while block, because the Op's
   // input may be changed by subsequent Ops, which may cause an error.
+  // For inference, ops that behind conditional branch aren't supported well,
   // so disable prepare optimization conservatively.
   bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
                             Attr<bool>("inference_force_prepare_data");
-  if (enable_cache_runtime_context_ && !force_prepare_data) {
+  if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
     need_prepare_data_ = false;
   }
 
   return new_scope;
 }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2bb5ca2c4a13ff053ea2d89b7c71833e91e442c4..f2de07db96df05e8d17258a5257bfa45e6741062 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -781,19 +781,18 @@ class OperatorWithKernel : public OperatorBase {
   // used for IndicateOrPromoteVarDataTypes
   phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                              const std::string& name) const;
-  bool NeedResetRuntimeContext(const Scope& scope) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
   mutable std::unique_ptr<OpKernelFunc> kernel_func_;
   mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
   mutable bool need_prepare_data_ = true;
   mutable bool need_prepare_phi_data_ = false;
   mutable bool enable_cache_runtime_context_ = false;
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
-  mutable Scope* cache_transfer_scope_ = nullptr;
   // NOTE(jiahongyu): Whether fallback to plain kernel after calling
   // GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard
   // code
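Note: the snippet below is an illustrative, standalone sketch (not Paddle code) of the caching pattern this diff restores: the per-operator RuntimeContext is rebuilt only when the cache is empty or the scope pointer changes, the check uses double-checked locking around a mutex, and PrepareData clears the cached scope pointer to force a rebuild after inputs are moved into a transfer scope. All names in the sketch (FakeScope, FakeContext, CachedRunner) are hypothetical.

// Standalone sketch of the scope-keyed context cache (hypothetical names).
#include <memory>
#include <mutex>

struct FakeScope {};

struct FakeContext {
  explicit FakeContext(const FakeScope* s) : scope(s) {}
  const FakeScope* scope;
};

class CachedRunner {
 public:
  void Run(const FakeScope& scope) {
    const FakeScope* cur_scope = &scope;
    // Unlocked check keeps the hot path cheap when the cache is valid.
    if (ctx_ == nullptr || pre_scope_ != cur_scope) {
      std::lock_guard<std::mutex> lock(mutex_);
      // Locked re-check: another thread may have refreshed the cache
      // between the first check and acquiring the lock.
      if (ctx_ == nullptr || pre_scope_ != cur_scope) {
        ctx_.reset(new FakeContext(cur_scope));
        pre_scope_ = cur_scope;
      }
    }
    // ... run the kernel against *ctx_ ...
  }

  // Mirrors `pre_scope_ = nullptr` in PrepareData: force a rebuild on the
  // next Run() once inputs have been copied into a fresh transfer scope.
  void Invalidate() { pre_scope_ = nullptr; }

 private:
  std::unique_ptr<FakeContext> ctx_;
  const FakeScope* pre_scope_ = nullptr;
  std::mutex mutex_;
};

int main() {
  FakeScope scope;
  CachedRunner runner;
  runner.Run(scope);  // builds the cached context
  runner.Run(scope);  // same scope pointer: reuses the cache
  runner.Invalidate();
  runner.Run(scope);  // rebuilds after invalidation
  return 0;
}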