PaddlePaddle / Paddle, commit b0e7681f (unverified)
Authored by xiaoxiaohehe001 on Aug 14, 2022; committed via GitHub on Aug 14, 2022
Revert "[Paddle Inference] Support cuda_graph. (#44878)" (#45115)
This reverts commit 84bf5c31.
Parent: e96dae8b
Showing 4 changed files with 15 additions and 193 deletions (+15, -193):
paddle/fluid/framework/inference_cached_ops.h (+0, -29)
paddle/fluid/framework/operator.cc (+15, -162)
paddle/fluid/framework/operator.h (+0, -1)
paddle/fluid/inference/api/paddle_pass_builder.cc (+0, -1)
paddle/fluid/framework/inference_cached_ops.h (deleted, file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>

namespace paddle {
namespace framework {

// cached ops will be captured to accelerate gpu performance.
// 1. op will generate a cudaGraph to record inner gpu kernels
// 2. inner gpu kernels can be launched by calling the cudagraphExecutor
// only once.
std::vector<std::string> cached_gpu_ops{"conv2d_fusion", "depthwise_conv2d"};

}  // namespace framework
}  // namespace paddle
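The deleted header above documents the technique the reverted feature was built on: record an op's inner GPU kernel launches into a CUDA graph once, then replay the instantiated executor on later runs to skip per-kernel launch overhead. A minimal self-contained sketch of that capture/instantiate/replay cycle, independent of Paddle (scale_kernel is a stand-in for an op's inner kernels; error checking omitted):

// capture_replay.cu -- minimal CUDA graph demo, independent of Paddle.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Capture: kernel launches on `stream` are recorded into a graph,
  // not executed (mirrors startCudaGraphCapture in the reverted code).
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, 2.0f, n);
  cudaStreamEndCapture(stream, &graph);

  // Instantiate once into an executable graph (endCudaGraphCapture above
  // does the same with cudaGraphInstantiate).
  cudaGraphExec_t graph_exec;
  cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0);

  // Replay: one launch call re-runs every captured kernel (runCudaGraph).
  for (int step = 0; step < 10; ++step) cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d_data);
  printf("captured once, replayed 10 times\n");
  return 0;
}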
paddle/fluid/framework/operator.cc
@@ -21,7 +21,6 @@ limitations under the License. */

#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/inference_cached_ops.h"
#include "paddle/fluid/framework/op_call_stack.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/shape_inference.h"
@@ -710,12 +709,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return in[0] != nullptr;
  }

  size_t InputsSize() const {
    auto& op_proto =
        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
    return op_proto->inputs().size();
  }

  bool HasOutput(const std::string& name) const override {
    // has only one output
    const auto& outs = ctx_.outputs;
@@ -1207,86 +1200,7 @@ struct OperatorWithKernel::CacheImpl {
    return infer_shape_ctx_.get();
  }

  bool updateInputsShapesDimCache() {
    bool flag = false;
    size_t inputs_size =
        std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
    for (size_t i = 0; i < inputs_size; i++) {
      const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
      if (!infer_shape_ctx_->HasInputs(in_name)) continue;
      if (!inputs_dim_caches.count(in_name) ||
          infer_shape_ctx_->GetInputsDim(in_name) !=
              inputs_dim_caches[in_name]) {
        inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
        flag = true;
      }
    }
#if defined(PADDLE_WITH_CUDA)
    if (flag) discardCudaGraphCache();
#endif
    return flag;
  }

  bool cudaGraphEnabled(bool need_prepare_data,
                        bool need_prepare_phi_data,
                        const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
    return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
           !need_prepare_data && !need_prepare_phi_data;
#else
    return false;
#endif
  }

  bool cacheEnabled(bool run_phi_kernel,
                    bool need_prepare_data,
                    bool need_prepare_phi_data,
                    const std::string& op_type) const {
#if defined(PADDLE_WITH_CUDA)
    if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
      return true;
#endif
    return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
  }

#if defined(PADDLE_WITH_CUDA)
  void startCudaGraphCapture() {
    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
    auto stream = ctx->stream();
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  }

  void endCudaGraphCapture() {
    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
    auto stream = ctx->stream();
    cudaGraph_t graph_;
    cudaStreamEndCapture(stream, &graph_);
    cudaGraphInstantiate(&graph_instance_, graph_, NULL, NULL, 0);
    graph_generated = true;
  }

  void runCudaGraph() {
    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
    auto stream = ctx->stream();
    cudaGraphLaunch(graph_instance_, stream);
  }

  bool cudaGraphGenerated() { return graph_generated; }

  void discardCudaGraphCache() { graph_generated = false; }

 private:
  bool graph_generated{false};
  cudaGraphExec_t graph_instance_;
#endif

 private:
  std::map<std::string, std::vector<DDim>> inputs_dim_caches;
  std::unique_ptr<phi::KernelContext> kernel_ctx_;
  std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
};
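updateInputsShapesDimCache above encodes the invalidation rule that makes graph replay safe: a captured graph bakes in kernel launch configurations, so it stays valid only while every input keeps the dims recorded at capture time. A hypothetical standalone sketch of that dim-cache check (ShapeCache and its members are illustrative names, not Paddle API):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Illustrative stand-in for the inputs_dim_caches bookkeeping above.
class ShapeCache {
 public:
  // Returns true (and refreshes the cache) if any input's dims differ from
  // the last recorded run, signalling that a captured graph must be rebuilt.
  bool Update(const std::map<std::string, std::vector<int64_t>>& inputs) {
    bool changed = false;
    for (const auto& [name, dims] : inputs) {
      auto it = dim_caches_.find(name);
      if (it == dim_caches_.end() || it->second != dims) {
        dim_caches_[name] = dims;
        changed = true;
      }
    }
    if (changed) graph_valid_ = false;  // same effect as discardCudaGraphCache
    return changed;
  }

  bool GraphValid() const { return graph_valid_; }
  void MarkCaptured() { graph_valid_ = true; }

 private:
  std::map<std::string, std::vector<int64_t>> dim_caches_;
  bool graph_valid_{false};
};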
@@ -1496,74 +1410,8 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
  this->Info().infer_shape_(&infer_shape_ctx);
}

void OperatorWithKernel::InitOpCache(const Scope& scope,
                                     const platform::Place& place) const {
  if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
    std::lock_guard<std::mutex> lock(cache_update_mutex_);
    if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
      pre_scope_ = &scope;
    }
  }
  impl_ = new CacheImpl(
      new phi::KernelContext(),
      new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
  RunImpl(scope, place, runtime_ctx_.get());
  if (impl_->cacheEnabled(run_phi_kernel_,
                          need_prepare_data_,
                          need_prepare_phi_data_,
                          Type())) {
    impl_->updateInputsShapesDimCache();
  }
}

void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  // function name: runOpCache()
  // effect: reuse cacheImpl to accelerate inference period
  auto runOpCache = [&]() {
#if defined(PADDLE_WITH_CUDA)
    if (impl_->cudaGraphEnabled(need_prepare_data_,
                                need_prepare_phi_data_,
                                Type())) {
      // cudaGraph cache
      if (impl_->updateInputsShapesDimCache()) {
        if (!all_kernels_must_compute_runtime_shape_)
          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
        (*phi_kernel_)(impl_->getKernelContext());
      } else if (!impl_->cudaGraphGenerated()) {
        impl_->startCudaGraphCapture();
        impl_->getKernelContext();
        RunImpl(scope, place, runtime_ctx_.get());
        impl_->endCudaGraphCapture();
      } else {
        if (!all_kernels_must_compute_runtime_shape_)
          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
        impl_->runCudaGraph();
      }
      return;
    }
#endif
    // common cache
    if (!all_kernels_must_compute_runtime_shape_)
      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
    (*phi_kernel_)(impl_->getKernelContext());
  };

  // function name: updateRuntimeContext
  // effect: update runtime_ctx from current scope.
  auto updateRuntimeContext = [&](const Scope& scope) {
    const Scope* cur_scope = &scope;
    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
      std::lock_guard<std::mutex> lock(cache_update_mutex_);
      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
        pre_scope_ = cur_scope;
      }
    }
  };

  // To reduce the elapsed time of HasAttr, we use bool variable to record the
  // result of HasAttr.
  if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
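Both InitOpCache and the updateRuntimeContext lambda above use the same check-lock-recheck idiom: test the cheap condition without the mutex, and only on the slow path take the lock and test again before rebuilding. A hypothetical condensed sketch of that idiom (ContextCache and its members are illustrative names, not Paddle API):

#include <memory>
#include <mutex>

struct Scope {};
struct RuntimeContext {
  explicit RuntimeContext(const Scope& s) : scope(&s) {}
  const Scope* scope;
};

class ContextCache {
 public:
  RuntimeContext* GetOrRebuild(const Scope& scope) {
    // Fast path: skip the mutex when the cache already matches this scope
    // (like the original, this read is deliberately unsynchronized).
    if (ctx_ == nullptr || pre_scope_ != &scope) {
      std::lock_guard<std::mutex> lock(mutex_);
      // Recheck under the lock: another thread may have rebuilt it already.
      if (ctx_ == nullptr || pre_scope_ != &scope) {
        ctx_.reset(new RuntimeContext(scope));
        pre_scope_ = &scope;
      }
    }
    return ctx_.get();
  }

 private:
  std::unique_ptr<RuntimeContext> ctx_;
  const Scope* pre_scope_{nullptr};
  std::mutex mutex_;
};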
@@ -1576,18 +1424,20 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    RuntimeContext ctx(Inputs(), Outputs(), scope);
    RunImpl(scope, place, &ctx);
    pre_scope_ = cur_scope;
  } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
             !need_prepare_phi_data_) {
    if (!all_kernels_must_compute_runtime_shape_)
      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
    (*phi_kernel_)(impl_->getKernelContext());
  } else {
    if (!impl_) {
      InitOpCache(scope, place);
    } else if (impl_->cacheEnabled(run_phi_kernel_,
                                   need_prepare_data_,
                                   need_prepare_phi_data_,
                                   Type())) {
      runOpCache();
    } else {
      updateRuntimeContext(scope);
      RunImpl(scope, place, runtime_ctx_.get());
    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
      std::lock_guard<std::mutex> lock(cache_update_mutex_);
      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
        pre_scope_ = cur_scope;
      }
    }
    RunImpl(scope, place, runtime_ctx_.get());
  }
@@ -1852,6 +1702,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    phi::KernelContext phi_kernel_context;
    if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
        !need_prepare_data_) {
      impl_ =
          new CacheImpl(new phi::KernelContext(),
                        new RuntimeInferShapeContext(*this, *runtime_ctx));
      BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
      (*phi_kernel_)(impl_->getKernelContext());
    } else {
paddle/fluid/framework/operator.h
@@ -712,7 +712,6 @@ class OperatorWithKernel : public OperatorBase {
  // used for IndicateOrPromoteVarDataTypes
  Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                   const std::string& name) const;

  void InitOpCache(const Scope& scope, const platform::Place& place) const;

 protected:
  mutable std::unique_ptr<OpKernelType> kernel_type_;
paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -165,7 +165,6 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
    "gpu_cpu_map_matmul_v2_to_matmul_pass",
    "fc_fuse_pass",
    "fc_elementwise_layernorm_fuse_pass",
    "runtime_context_cache_pass",
};

const std::vector<std::string> kTrtLowerPrecisionPasses{
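One line in the hunk above is deleted by the revert, presumably the "runtime_context_cache_pass" entry that the cuda_graph change had added to kGpuLowerPrecisionPasses. Independently of this commit, passes like these can also be toggled per config through the public pass-builder API; a hedged sketch (model paths are placeholders, and the API names assume the Paddle Inference 2.x C++ interface):

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("model/inference.pdmodel", "model/inference.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);

  // Remove a pass for this config only; the built-in pass lists such as
  // kGpuLowerPrecisionPasses stay untouched.
  config.pass_builder()->DeletePass("runtime_context_cache_pass");

  auto predictor = paddle_infer::CreatePredictor(config);
  (void)predictor;
  return 0;
}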