Unverified · commit 84bf5c31 authored by xiaoxiaohehe001, committed by GitHub

[Paddle Inference] Support cuda_graph. (#44878)

* cuda_graph

* cuda_graph_

* cuda_graph_

* cuda_graph_
Parent 93c5c887
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
namespace paddle {
namespace framework {
// Ops listed here are cached to accelerate GPU inference:
// 1. the op records its inner GPU kernels into a cudaGraph;
// 2. on later runs the recorded kernels are replayed with a single
//    cudaGraphLaunch on the instantiated cudaGraphExec_t.
static const std::vector<std::string> cached_gpu_ops{"conv2d_fusion",
                                                     "depthwise_conv2d"};
} // namespace framework
} // namespace paddle
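For context, the capture/replay pattern that CacheImpl uses further down follows the standard CUDA Graphs API. Below is a minimal standalone sketch of that pattern, not code from this commit: the toy kernel, sizes, and iteration count are made up for illustration, error checking is omitted for brevity, and the five-argument cudaGraphInstantiate signature matches the CUDA 11-era toolkit this commit targets.

#include <cuda_runtime.h>

__global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1 << 20;
  float* x;
  cudaMalloc(&x, n * sizeof(float));  // allocate before capture begins
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Capture: kernels launched on the stream are recorded, not executed.
  cudaGraph_t graph;
  cudaGraphExec_t exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(x, 2.0f, n);
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);

  // 2. Replay: one cudaGraphLaunch re-issues all recorded kernels,
  //    skipping the per-kernel CPU launch overhead.
  for (int step = 0; step < 100; ++step) cudaGraphLaunch(exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(x);
  return 0;
}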
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/inference_cached_ops.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/shape_inference.h"
......@@ -709,6 +710,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
return in[0] != nullptr;
}
size_t InputsSize() const {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
return op_proto->inputs().size();
}
bool HasOutput(const std::string& name) const override {
// has only one output
const auto& outs = ctx_.outputs;
......@@ -1200,7 +1207,86 @@ struct OperatorWithKernel::CacheImpl {
return infer_shape_ctx_.get();
}
// Returns true when any input's dims differ from the cached dims; a shape
// change also invalidates the captured CUDA graph (if any).
bool updateInputsShapesDimCache() {
bool flag = false;
size_t inputs_size =
std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
for (size_t i = 0; i < inputs_size; i++) {
const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
if (!infer_shape_ctx_->HasInputs(in_name)) continue;
if (!inputs_dim_caches.count(in_name) ||
infer_shape_ctx_->GetInputsDim(in_name) !=
inputs_dim_caches[in_name]) {
inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
flag = true;
}
}
#if defined(PADDLE_WITH_CUDA)
if (flag) discardCudaGraphCache();
#endif
return flag;
}
// CUDA graph capture is only enabled for the whitelisted ops in
// cached_gpu_ops, and only when no extra data preparation is required.
bool cudaGraphEnabled(bool need_prepare_data,
bool need_prepare_phi_data,
const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
!need_prepare_data && !need_prepare_phi_data;
#else
return false;
#endif
}
bool cacheEnabled(bool run_phi_kernel,
bool need_prepare_data,
bool need_prepare_phi_data,
const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
return true;
#endif
return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
}
#if defined(PADDLE_WITH_CUDA)
// Begin recording GPU kernels issued on the inference stream into a
// CUDA graph instead of executing them immediately.
void startCudaGraphCapture() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
}
// Stop recording, instantiate an executable graph, and mark it as ready;
// later runs replay it via runCudaGraph().
void endCudaGraphCapture() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaGraph_t graph;
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&graph_instance_, graph, NULL, NULL, 0);
graph_generated = true;
}
// Replay the captured kernels with a single launch call.
void runCudaGraph() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaGraphLaunch(graph_instance_, stream);
}
bool cudaGraphGenerated() { return graph_generated; }
void discardCudaGraphCache() { graph_generated = false; }
private:
bool graph_generated{false};
cudaGraphExec_t graph_instance_;
#endif
private:
std::map<std::string, std::vector<DDim>> inputs_dim_caches;
std::unique_ptr<phi::KernelContext> kernel_ctx_;
std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
};
......@@ -1410,8 +1496,74 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->Info().infer_shape_(&infer_shape_ctx);
}
void OperatorWithKernel::InitOpCache(const Scope& scope,
const platform::Place& place) const {
if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = &scope;
}
}
impl_ =
new CacheImpl(new phi::KernelContext(),
new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
RunImpl(scope, place, runtime_ctx_.get());
if (impl_->cacheEnabled(run_phi_kernel_,
need_prepare_data_,
need_prepare_phi_data_,
Type())) {
impl_->updateInputsShapesDimCache();
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// runOpCache(): reuse the cached kernel/infer-shape contexts (and the
// captured CUDA graph, when available) to speed up repeated inference runs.
auto runOpCache = [&]() {
#if defined(PADDLE_WITH_CUDA)
if (impl_->cudaGraphEnabled(
need_prepare_data_, need_prepare_phi_data_, Type())) {
// CUDA-graph path. If any input shape changed, updateInputsShapesDimCache()
// discards the stale graph and we fall back to an eager run this time.
if (impl_->updateInputsShapesDimCache()) {
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
} else if (!impl_->cudaGraphGenerated()) {
// Shapes are stable but no graph has been captured yet: record one full run.
impl_->startCudaGraphCapture();
impl_->getKernelContext();
RunImpl(scope, place, runtime_ctx_.get());
impl_->endCudaGraphCapture();
} else {
// Shapes are stable and a graph exists: replay the recorded kernels.
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
impl_->runCudaGraph();
}
return;
}
#endif
// Common cached path (no CUDA graph): reuse the cached contexts to run
// infer-shape and the phi kernel directly.
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
};
// updateRuntimeContext(): rebuild runtime_ctx_ from the current scope when
// the scope has changed, using double-checked locking.
auto updateRuntimeContext = [&](const Scope& scope) {
const Scope* cur_scope = &scope;
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
}
};
// To avoid the cost of repeated HasAttr calls, cache the result in a bool
// member.
if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
......@@ -1424,21 +1576,19 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
}
if (!impl_) {
InitOpCache(scope, place);
} else if (impl_->cacheEnabled(run_phi_kernel_,
need_prepare_data_,
need_prepare_phi_data_,
Type())) {
runOpCache();
} else {
updateRuntimeContext(scope);
RunImpl(scope, place, runtime_ctx_.get());
}
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
......@@ -1702,9 +1852,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
phi::KernelContext phi_kernel_context;
if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
!need_prepare_data_) {
impl_ =
new CacheImpl(new phi::KernelContext(),
new RuntimeInferShapeContext(*this, *runtime_ctx));
BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
(*phi_kernel_)(impl_->getKernelContext());
} else {
......
......@@ -712,6 +712,7 @@ class OperatorWithKernel : public OperatorBase {
// used for IndicateOrPromoteVarDataTypes
Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
const std::string& name) const;
void InitOpCache(const Scope& scope, const platform::Place& place) const;
protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
......
......@@ -165,6 +165,7 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
"gpu_cpu_map_matmul_v2_to_matmul_pass",
"fc_fuse_pass",
"fc_elementwise_layernorm_fuse_pass",
"runtime_context_cache_pass",
};
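runtime_context_cache_pass marks ops with the kEnableCacheRuntimeContext attribute that the cached RunImpl path above checks, so adding it here lets the GPU low-precision pipeline reach the new cache / CUDA-graph path. As a rough usage sketch, not part of this commit: the model paths, input shape, and memory pool size below are hypothetical, it assumes the standard paddle_infer C++ API, and whether this particular pass list is selected depends on additional config options not shown here.

#include <string>
#include <vector>
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./resnet50/inference.pdmodel",    // hypothetical paths
                  "./resnet50/inference.pdiparams");
  config.EnableUseGpu(512, 0);   // 512 MB initial pool on GPU 0
  config.SwitchIrOptim(true);    // run the IR pass pipeline

  auto predictor = paddle_infer::CreatePredictor(config);

  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
  auto in_name = predictor->GetInputNames()[0];
  auto in_tensor = predictor->GetInputHandle(in_name);
  in_tensor->Reshape({1, 3, 224, 224});
  in_tensor->CopyFromCpu(input.data());

  // Repeated runs with a fixed input shape are where the cached contexts
  // (and, for whitelisted ops, the captured CUDA graph) pay off.
  for (int i = 0; i < 10; ++i) predictor->Run();
  return 0;
}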
const std::vector<std::string> kTrtLowerPrecisionPasses{
......