diff --git a/paddle/fluid/framework/inference_cached_ops.h b/paddle/fluid/framework/inference_cached_ops.h
deleted file mode 100644
index 50444e180718a316a5d5c0ec226c4f1aa4e74854..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/inference_cached_ops.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-// cached ops will be captured to accelerate gpu performance.
-// 1. op will generate a cudaGraph to record inner gpu kernels
-// 2. inner gpu kernels can be launched by calling the cudagraphExecutor
-// only once.
-std::vector<std::string> cached_gpu_ops{"conv2d_fusion", "depthwise_conv2d"};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c2a665126767c8872896ac8f601f47a0d11f1231..b4ef3efb8216c740cbec9141d099e66bb4838c32 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/inference_cached_ops.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -710,12 +709,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return in[0] != nullptr;
   }
 
-  size_t InputsSize() const {
-    auto& op_proto =
-        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
-    return op_proto->inputs().size();
-  }
-
   bool HasOutput(const std::string& name) const override {
     // has only one output
     const auto& outs = ctx_.outputs;
@@ -1207,86 +1200,7 @@ struct OperatorWithKernel::CacheImpl {
     return infer_shape_ctx_.get();
   }
 
-  bool updateInputsShapesDimCache() {
-    bool flag = false;
-    size_t inputs_size =
-        std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
-    for (size_t i = 0; i < inputs_size; i++) {
-      const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
-      if (!infer_shape_ctx_->HasInputs(in_name)) continue;
-      if (!inputs_dim_caches.count(in_name) ||
-          infer_shape_ctx_->GetInputsDim(in_name) !=
-              inputs_dim_caches[in_name]) {
-        inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
-        flag = true;
-      }
-    }
-
-#if defined(PADDLE_WITH_CUDA)
-    if (flag) discardCudaGraphCache();
-#endif
-    return flag;
-  }
-
-  bool cudaGraphEnabled(bool need_prepare_data,
-                        bool need_prepare_phi_data,
-                        const std::string& op_type) const {
-#if defined(PADDLE_WITH_CUDA)
-    return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
-           !need_prepare_data && !need_prepare_phi_data;
-#else
-    return false;
-#endif
-  }
-
-  bool cacheEnabled(bool run_phi_kernel,
-                    bool need_prepare_data,
-                    bool need_prepare_phi_data,
-                    const std::string& op_type) const {
-#if defined(PADDLE_WITH_CUDA)
-    if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
-      return true;
-#endif
-    return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
-  }
-
-#if defined(PADDLE_WITH_CUDA)
-  void startCudaGraphCapture() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
-  }
-
-  void endCudaGraphCapture() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-
-    cudaGraph_t graph_;
-    cudaStreamEndCapture(stream, &graph_);
-    cudaGraphInstantiate(&graph_instance_, graph_, NULL, NULL, 0);
-    graph_generated = true;
-  }
-
-  void runCudaGraph() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-    cudaGraphLaunch(graph_instance_, stream);
-  }
-
-  bool cudaGraphGenerated() { return graph_generated; }
-
-  void discardCudaGraphCache() { graph_generated = false; }
-
- private:
-  bool graph_generated{false};
-  cudaGraphExec_t graph_instance_;
-#endif
-
  private:
-  std::map<std::string, std::vector<DDim>> inputs_dim_caches;
   std::unique_ptr<phi::KernelContext> kernel_ctx_;
   std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
 };
@@ -1496,74 +1410,8 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->Info().infer_shape_(&infer_shape_ctx);
 }
 
-void OperatorWithKernel::InitOpCache(const Scope& scope,
-                                     const platform::Place& place) const {
-  if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
-    std::lock_guard<std::mutex> lock(cache_update_mutex_);
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
-      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-      pre_scope_ = &scope;
-    }
-  }
-
-  impl_ =
-      new CacheImpl(new phi::KernelContext(),
-                    new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
-
-  RunImpl(scope, place, runtime_ctx_.get());
-  if (impl_->cacheEnabled(run_phi_kernel_,
-                          need_prepare_data_,
-                          need_prepare_phi_data_,
-                          Type())) {
-    impl_->updateInputsShapesDimCache();
-  }
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  // function name: runOpCache()
-  // effect: reuse cacheImpl to accelerate inference period
-  auto runOpCache = [&]() {
-#if defined(PADDLE_WITH_CUDA)
-    if (impl_->cudaGraphEnabled(
-            need_prepare_data_, need_prepare_phi_data_, Type())) {
-      // cudaGraph cache
-      if (impl_->updateInputsShapesDimCache()) {
-        if (!all_kernels_must_compute_runtime_shape_)
-          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-        (*phi_kernel_)(impl_->getKernelContext());
-      } else if (!impl_->cudaGraphGenerated()) {
-        impl_->startCudaGraphCapture();
-        impl_->getKernelContext();
-        RunImpl(scope, place, runtime_ctx_.get());
-        impl_->endCudaGraphCapture();
-      } else {
-        if (!all_kernels_must_compute_runtime_shape_)
-          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-        impl_->runCudaGraph();
-      }
-      return;
-    }
-#endif
-    // common cache
-    if (!all_kernels_must_compute_runtime_shape_)
-      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-    (*phi_kernel_)(impl_->getKernelContext());
-  };
-
-  // function name: updateRuntimeContext
-  // effect: update runtime_ctx from current scope.
-  auto updateRuntimeContext = [&](const Scope& scope) {
-    const Scope* cur_scope = &scope;
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-      std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-        pre_scope_ = cur_scope;
-      }
-    }
-  };
-
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
   // result of HasAttr.
   if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
@@ -1576,18 +1424,20 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
     pre_scope_ = cur_scope;
+  } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
+             !need_prepare_phi_data_) {
+    if (!all_kernels_must_compute_runtime_shape_)
+      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+    (*phi_kernel_)(impl_->getKernelContext());
   } else {
-    if (!impl_) {
-      InitOpCache(scope, place);
-    } else if (impl_->cacheEnabled(run_phi_kernel_,
-                                   need_prepare_data_,
-                                   need_prepare_phi_data_,
-                                   Type())) {
-      runOpCache();
-    } else {
-      updateRuntimeContext(scope);
-      RunImpl(scope, place, runtime_ctx_.get());
+    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+      std::lock_guard<std::mutex> lock(cache_update_mutex_);
+      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+        pre_scope_ = cur_scope;
+      }
     }
+    RunImpl(scope, place, runtime_ctx_.get());
   }
 }
 
@@ -1852,6 +1702,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     phi::KernelContext phi_kernel_context;
     if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
         !need_prepare_data_) {
+      impl_ =
+          new CacheImpl(new phi::KernelContext(),
+                        new RuntimeInferShapeContext(*this, *runtime_ctx));
       BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
       (*phi_kernel_)(impl_->getKernelContext());
     } else {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 4185b450c7a8ce3ad3c465dbf44542df7acde300..17ec9a1f93e723f5381e259669429db8209e9b3f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -712,7 +712,6 @@ class OperatorWithKernel : public OperatorBase {
   // used for IndicateOrPromoteVarDataTypes
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
-  void InitOpCache(const Scope& scope, const platform::Place& place) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 739c146635335c877cd6e9ca485517dfc5dec141..6119714c38cc12bf18734cae28240a5315795079 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -165,7 +165,6 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "gpu_cpu_map_matmul_v2_to_matmul_pass",
     "fc_fuse_pass",
     "fc_elementwise_layernorm_fuse_pass",
-    "runtime_context_cache_pass",
 };
 
 const std::vector<std::string> kTrtLowerPrecisionPasses{
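
Note: the CacheImpl members removed above (startCudaGraphCapture, endCudaGraphCapture, runCudaGraph, discardCudaGraphCache) implemented the standard CUDA Graph stream-capture pattern: record the kernels an op launches on a stream once, instantiate an executable graph, and then replay the whole sequence with a single cudaGraphLaunch call. For reference, a minimal standalone sketch of that pattern follows (plain CUDA runtime API, outside of Paddle; the scale kernel, buffer size, and iteration count are illustrative assumptions, and error checking is omitted).

#include <cuda_runtime.h>

// Illustrative kernel; stands in for the fused GPU kernels an op would launch.
__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Capture: kernels launched on `stream` are recorded, not executed
  //    (mirrors startCudaGraphCapture / endCudaGraphCapture above).
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d_data, 2.0f, n);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d_data, 0.5f, n);
  cudaStreamEndCapture(stream, &graph);

  // 2. Instantiate the executable graph once.
  cudaGraphExec_t graph_exec;
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3. Replay: one launch call re-runs every recorded kernel, avoiding
  //    per-kernel launch overhead on later iterations (mirrors runCudaGraph).
  for (int iter = 0; iter < 100; ++iter) {
    cudaGraphLaunch(graph_exec, stream);
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d_data);
  return 0;
}

A captured graph bakes in the shapes, pointers, and launch configurations observed during capture, which is why the removed code re-checked the cached input dims (updateInputsShapesDimCache) and discarded the graph whenever they changed.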