diff --git a/paddle/fluid/framework/inference_cached_ops.h b/paddle/fluid/framework/inference_cached_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..50444e180718a316a5d5c0ec226c4f1aa4e74854
--- /dev/null
+++ b/paddle/fluid/framework/inference_cached_ops.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+// Ops listed here are cached with cudaGraph to improve GPU performance:
+// 1. the op records its inner GPU kernels into a cudaGraph;
+// 2. on later runs, all recorded kernels are replayed with a single
+//    cudaGraph executor launch.
+std::vector<std::string> cached_gpu_ops{"conv2d_fusion", "depthwise_conv2d"};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b4ef3efb8216c740cbec9141d099e66bb4838c32..c2a665126767c8872896ac8f601f47a0d11f1231 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
+#include "paddle/fluid/framework/inference_cached_ops.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -709,6 +710,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return in[0] != nullptr;
   }
 
+  size_t InputsSize() const {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    return op_proto->inputs().size();
+  }
+
   bool HasOutput(const std::string& name) const override {
     // has only one output
     const auto& outs = ctx_.outputs;
@@ -1200,7 +1207,86 @@ struct OperatorWithKernel::CacheImpl {
     return infer_shape_ctx_.get();
   }
 
+  bool updateInputsShapesDimCache() {
+    bool flag = false;
+    size_t inputs_size =
+        std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
+    for (size_t i = 0; i < inputs_size; i++) {
+      const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
+      if (!infer_shape_ctx_->HasInputs(in_name)) continue;
+      if (!inputs_dim_caches.count(in_name) ||
+          infer_shape_ctx_->GetInputsDim(in_name) !=
+              inputs_dim_caches[in_name]) {
+        inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
+        flag = true;
+      }
+    }
+
+#if defined(PADDLE_WITH_CUDA)
+    if (flag) discardCudaGraphCache();
+#endif
+    return flag;
+  }
+
+  bool cudaGraphEnabled(bool need_prepare_data,
+                        bool need_prepare_phi_data,
+                        const std::string& op_type) const {
+#if defined(PADDLE_WITH_CUDA)
+    return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
+           !need_prepare_data && !need_prepare_phi_data;
+#else
+    return false;
+#endif
+  }
+
+  bool cacheEnabled(bool run_phi_kernel,
+                    bool need_prepare_data,
+                    bool need_prepare_phi_data,
+                    const std::string& op_type) const {
+#if defined(PADDLE_WITH_CUDA)
+    if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
+      return true;
+#endif
+    return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
+  }
+
+#if defined(PADDLE_WITH_CUDA)
+  void startCudaGraphCapture() {
+    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
+        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+    auto stream = ctx->stream();
+    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+  }
+
+  void endCudaGraphCapture() {
+    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
+        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+    auto stream = ctx->stream();
+
+    cudaGraph_t graph_;
+    cudaStreamEndCapture(stream, &graph_);
+    cudaGraphInstantiate(&graph_instance_, graph_, NULL, NULL, 0);
+    graph_generated = true;
+  }
+
+  void runCudaGraph() {
+    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
+        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+    auto stream = ctx->stream();
+    cudaGraphLaunch(graph_instance_, stream);
+  }
+
+  bool cudaGraphGenerated() { return graph_generated; }
+
+  void discardCudaGraphCache() { graph_generated = false; }
+
+ private:
+  bool graph_generated{false};
+  cudaGraphExec_t graph_instance_;
+#endif
+
  private:
+  std::map<std::string, std::vector<DDim>> inputs_dim_caches;
   std::unique_ptr<phi::KernelContext> kernel_ctx_;
   std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
 };
@@ -1410,8 +1496,74 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->Info().infer_shape_(&infer_shape_ctx);
 }
 
+void OperatorWithKernel::InitOpCache(const Scope& scope,
+                                     const platform::Place& place) const {
+  if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
+    std::lock_guard<std::mutex> lock(cache_update_mutex_);
+    if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
+      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+      pre_scope_ = &scope;
+    }
+  }
+
+  impl_ =
+      new CacheImpl(new phi::KernelContext(),
+                    new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
+
+  RunImpl(scope, place, runtime_ctx_.get());
+  if (impl_->cacheEnabled(run_phi_kernel_,
+                          need_prepare_data_,
+                          need_prepare_phi_data_,
+                          Type())) {
+    impl_->updateInputsShapesDimCache();
+  }
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
+  // runOpCache(): reuse the cached impl_ (and, when available, the captured
+  // cudaGraph) to speed up the inference path.
+  auto runOpCache = [&]() {
+#if defined(PADDLE_WITH_CUDA)
+    if (impl_->cudaGraphEnabled(
+            need_prepare_data_, need_prepare_phi_data_, Type())) {
+      // cudaGraph cache path
+      if (impl_->updateInputsShapesDimCache()) {
+        if (!all_kernels_must_compute_runtime_shape_)
+          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+        (*phi_kernel_)(impl_->getKernelContext());
+      } else if (!impl_->cudaGraphGenerated()) {
+        impl_->startCudaGraphCapture();
+        impl_->getKernelContext();
+        RunImpl(scope, place, runtime_ctx_.get());
+        impl_->endCudaGraphCapture();
+      } else {
+        if (!all_kernels_must_compute_runtime_shape_)
+          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+        impl_->runCudaGraph();
+      }
+      return;
+    }
+#endif
+    // common cache path
+    if (!all_kernels_must_compute_runtime_shape_)
+      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+    (*phi_kernel_)(impl_->getKernelContext());
+  };
+
+  // updateRuntimeContext(): rebuild runtime_ctx_ from the current scope when
+  // the scope has changed.
+  auto updateRuntimeContext = [&](const Scope& scope) {
+    const Scope* cur_scope = &scope;
+    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+      std::lock_guard<std::mutex> lock(cache_update_mutex_);
+      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+        pre_scope_ = cur_scope;
+      }
+    }
+  };
+
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
   // result of HasAttr.
   if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
@@ -1424,20 +1576,18 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
     pre_scope_ = cur_scope;
-  } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
-             !need_prepare_phi_data_) {
-    if (!all_kernels_must_compute_runtime_shape_)
-      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-    (*phi_kernel_)(impl_->getKernelContext());
   } else {
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-      std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-        pre_scope_ = cur_scope;
-      }
+    if (!impl_) {
+      InitOpCache(scope, place);
+    } else if (impl_->cacheEnabled(run_phi_kernel_,
+                                   need_prepare_data_,
+                                   need_prepare_phi_data_,
+                                   Type())) {
+      runOpCache();
+    } else {
+      updateRuntimeContext(scope);
+      RunImpl(scope, place, runtime_ctx_.get());
     }
-    RunImpl(scope, place, runtime_ctx_.get());
   }
 }
 
@@ -1702,9 +1852,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     phi::KernelContext phi_kernel_context;
     if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
         !need_prepare_data_) {
-      impl_ =
-          new CacheImpl(new phi::KernelContext(),
-                        new RuntimeInferShapeContext(*this, *runtime_ctx));
       BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
       (*phi_kernel_)(impl_->getKernelContext());
     } else {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 17ec9a1f93e723f5381e259669429db8209e9b3f..4185b450c7a8ce3ad3c465dbf44542df7acde300 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -712,6 +712,7 @@ class OperatorWithKernel : public OperatorBase {
   // used for IndicateOrPromoteVarDataTypes
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
+  void InitOpCache(const Scope& scope, const platform::Place& place) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 6119714c38cc12bf18734cae28240a5315795079..739c146635335c877cd6e9ca485517dfc5dec141 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -165,6 +165,7 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
    "gpu_cpu_map_matmul_v2_to_matmul_pass",
    "fc_fuse_pass",
    "fc_elementwise_layernorm_fuse_pass",
+   "runtime_context_cache_pass",
 };
 
 const std::vector<std::string> kTrtLowerPrecisionPasses{
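
Note (not part of the patch): the CacheImpl helpers startCudaGraphCapture / endCudaGraphCapture / runCudaGraph wrap the standard CUDA stream-capture workflow. The self-contained sketch below shows that workflow in isolation: record the kernels launched on a stream into a cudaGraph, instantiate an executable graph once, then replay all recorded kernels with a single launch per run. The scale kernel, buffer size, and iteration count are illustrative only; it assumes a CUDA 11.x toolkit, which provides the five-argument cudaGraphInstantiate overload used in the patch.

// Minimal cudaGraph capture/replay sketch (build with nvcc).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Record every kernel launched on the stream into a cudaGraph
  //    (what startCudaGraphCapture + the first op run do).
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d, 2.0f, n);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d, 0.5f, n);

  // 2. End capture and instantiate an executable graph
  //    (what endCudaGraphCapture does; pre-CUDA-12 overload).
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3. Replay all recorded kernels with one launch call per iteration
  //    (what runCudaGraph does on later runs of the cached op).
  for (int iter = 0; iter < 10; ++iter) {
    cudaGraphLaunch(graph_exec, stream);
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d);
  printf("graph replayed\n");
  return 0;
}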