Unverified · commit 84bf5c31 authored by xiaoxiaohehe001, committed by GitHub

[Paddle Inference] Support cuda_graph. (#44878)

* cuda_graph

* cuda_graph_

* cuda_graph_

* cuda_graph_
Parent 93c5c887
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
namespace paddle {
namespace framework {
// Ops listed here are cached to accelerate GPU inference:
// 1. the op records its inner GPU kernels into a cudaGraph;
// 2. on later runs the recorded kernels are replayed with a single
//    cudaGraphLaunch on the instantiated cudaGraphExec_t.
static const std::vector<std::string> cached_gpu_ops{"conv2d_fusion",
                                                     "depthwise_conv2d"};
} // namespace framework
} // namespace paddle
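For context, the capture/replay pattern that CacheImpl uses further down follows the standard CUDA Graphs API. Below is a minimal standalone sketch of that pattern, not code from this commit: the toy kernel, sizes, and iteration count are made up for illustration, error checking is omitted for brevity, and the five-argument cudaGraphInstantiate signature matches the CUDA 11-era toolkit this commit targets.

#include <cuda_runtime.h>

__global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1 << 20;
  float* x;
  cudaMalloc(&x, n * sizeof(float));  // allocate before capture begins
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Capture: kernels launched on the stream are recorded, not executed.
  cudaGraph_t graph;
  cudaGraphExec_t exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(x, 2.0f, n);
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);

  // 2. Replay: one cudaGraphLaunch re-issues all recorded kernels,
  //    skipping the per-kernel CPU launch overhead.
  for (int step = 0; step < 100; ++step) cudaGraphLaunch(exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(x);
  return 0;
}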
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/inference_cached_ops.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/shape_inference.h"
......@@ -709,6 +710,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
return in[0] != nullptr;
}
size_t InputsSize() const {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
return op_proto->inputs().size();
}
bool HasOutput(const std::string& name) const override {
// has only one output
const auto& outs = ctx_.outputs;
......@@ -1200,7 +1207,86 @@ struct OperatorWithKernel::CacheImpl {
return infer_shape_ctx_.get();
}
// Returns true when any input's dims differ from the cached dims; a shape
// change also invalidates the captured CUDA graph (if any).
bool updateInputsShapesDimCache() {
bool flag = false;
size_t inputs_size =
std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
for (size_t i = 0; i < inputs_size; i++) {
const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
if (!infer_shape_ctx_->HasInputs(in_name)) continue;
if (!inputs_dim_caches.count(in_name) ||
infer_shape_ctx_->GetInputsDim(in_name) !=
inputs_dim_caches[in_name]) {
inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
flag = true;
}
}
#if defined(PADDLE_WITH_CUDA)
if (flag) discardCudaGraphCache();
#endif
return flag;
}
// CUDA graph capture is only enabled for the whitelisted ops in
// cached_gpu_ops, and only when no extra data preparation is required.
bool cudaGraphEnabled(bool need_prepare_data,
bool need_prepare_phi_data,
const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
!need_prepare_data && !need_prepare_phi_data;
#else
return false;
#endif
}
bool cacheEnabled(bool run_phi_kernel,
bool need_prepare_data,
bool need_prepare_phi_data,
const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
return true;
#endif
return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
}
#if defined(PADDLE_WITH_CUDA)
// Begin recording GPU kernels issued on the inference stream into a
// CUDA graph instead of executing them immediately.
void startCudaGraphCapture() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
}
// Stop recording, instantiate an executable graph, and mark it as ready;
// later runs replay it via runCudaGraph().
void endCudaGraphCapture() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaGraph_t graph;
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&graph_instance_, graph, NULL, NULL, 0);
graph_generated = true;
}
// Replay the captured kernels with a single launch call.
void runCudaGraph() {
phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
auto stream = ctx->stream();
cudaGraphLaunch(graph_instance_, stream);
}
bool cudaGraphGenerated() { return graph_generated; }
void discardCudaGraphCache() { graph_generated = false; }
private:
bool graph_generated{false};
cudaGraphExec_t graph_instance_;
#endif
private:
std::map<std::string, std::vector<DDim>> inputs_dim_caches;
std::unique_ptr<phi::KernelContext> kernel_ctx_;
std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
};
......@@ -1410,8 +1496,74 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->Info().infer_shape_(&infer_shape_ctx);
}
void OperatorWithKernel::InitOpCache(const Scope& scope,
const platform::Place& place) const {
if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = &scope;
}
}
impl_ =
new CacheImpl(new phi::KernelContext(),
new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
RunImpl(scope, place, runtime_ctx_.get());
if (impl_->cacheEnabled(run_phi_kernel_,
need_prepare_data_,
need_prepare_phi_data_,
Type())) {
impl_->updateInputsShapesDimCache();
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// runOpCache(): reuse the cached kernel/infer-shape contexts (and the
// captured CUDA graph, when available) to speed up repeated inference runs.
auto runOpCache = [&]() {
#if defined(PADDLE_WITH_CUDA)
if (impl_->cudaGraphEnabled(
need_prepare_data_, need_prepare_phi_data_, Type())) {
// CUDA-graph path. If any input shape changed, updateInputsShapesDimCache()
// discards the stale graph and we fall back to an eager run this time.
if (impl_->updateInputsShapesDimCache()) {
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
} else if (!impl_->cudaGraphGenerated()) {
// Shapes are stable but no graph has been captured yet: record one full run.
impl_->startCudaGraphCapture();
impl_->getKernelContext();
RunImpl(scope, place, runtime_ctx_.get());
impl_->endCudaGraphCapture();
} else {
// Shapes are stable and a graph exists: replay the recorded kernels.
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
impl_->runCudaGraph();
}
return;
}
#endif
// Common cached path (no CUDA graph): reuse the cached contexts to run
// infer-shape and the phi kernel directly.
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
};
// updateRuntimeContext(): rebuild runtime_ctx_ from the current scope when
// the scope has changed, using double-checked locking.
auto updateRuntimeContext = [&](const Scope& scope) {
const Scope* cur_scope = &scope;
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
}
};
// To avoid the cost of repeated HasAttr calls, cache the result in a bool
// member.
if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
......@@ -1424,21 +1576,19 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_)
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
}
if (!impl_) {
InitOpCache(scope, place);
} else if (impl_->cacheEnabled(run_phi_kernel_,
need_prepare_data_,
need_prepare_phi_data_,
Type())) {
runOpCache();
} else {
updateRuntimeContext(scope);
RunImpl(scope, place, runtime_ctx_.get());
}
}
}
void OperatorWithKernel::RunImpl(const Scope& scope,
......@@ -1702,9 +1852,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
phi::KernelContext phi_kernel_context;
if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
!need_prepare_data_) {
impl_ =
new CacheImpl(new phi::KernelContext(),
new RuntimeInferShapeContext(*this, *runtime_ctx));
BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
(*phi_kernel_)(impl_->getKernelContext());
} else {
......
......@@ -712,6 +712,7 @@ class OperatorWithKernel : public OperatorBase {
// used for IndicateOrPromoteVarDataTypes
Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
const std::string& name) const;
void InitOpCache(const Scope& scope, const platform::Place& place) const;
protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
......
......@@ -165,6 +165,7 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
"gpu_cpu_map_matmul_v2_to_matmul_pass",
"fc_fuse_pass",
"fc_elementwise_layernorm_fuse_pass",
"runtime_context_cache_pass",
};
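runtime_context_cache_pass marks ops with the kEnableCacheRuntimeContext attribute that the cached RunImpl path above checks, so adding it here lets the GPU low-precision pipeline reach the new cache / CUDA-graph path. As a rough usage sketch, not part of this commit: the model paths, input shape, and memory pool size below are hypothetical, it assumes the standard paddle_infer C++ API, and whether this particular pass list is selected depends on additional config options not shown here.

#include <string>
#include <vector>
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./resnet50/inference.pdmodel",    // hypothetical paths
                  "./resnet50/inference.pdiparams");
  config.EnableUseGpu(512, 0);   // 512 MB initial pool on GPU 0
  config.SwitchIrOptim(true);    // run the IR pass pipeline

  auto predictor = paddle_infer::CreatePredictor(config);

  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
  auto in_name = predictor->GetInputNames()[0];
  auto in_tensor = predictor->GetInputHandle(in_name);
  in_tensor->Reshape({1, 3, 224, 224});
  in_tensor->CopyFromCpu(input.data());

  // Repeated runs with a fixed input shape are where the cached contexts
  // (and, for whitelisted ops, the captured CUDA graph) pay off.
  for (int i = 0; i < 10; ++i) predictor->Run();
  return 0;
}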
const std::vector<std::string> kTrtLowerPrecisionPasses{
......