From b0e7681fc99ca3fc6f2677f6c7054d84fad013dd Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Sun, 14 Aug 2022 21:13:44 +0800
Subject: [PATCH] Revert "[Paddle Inference] Support cuda_graph. (#44878)" (#45115)

This reverts commit 84bf5c313d112acbb96d93bbe686afc4101bdb85.
---
 paddle/fluid/framework/inference_cached_ops.h |  29 ---
 paddle/fluid/framework/operator.cc            | 177 ++----------------
 paddle/fluid/framework/operator.h             |   1 -
 .../inference/api/paddle_pass_builder.cc      |   1 -
 4 files changed, 15 insertions(+), 193 deletions(-)
 delete mode 100644 paddle/fluid/framework/inference_cached_ops.h

diff --git a/paddle/fluid/framework/inference_cached_ops.h b/paddle/fluid/framework/inference_cached_ops.h
deleted file mode 100644
index 50444e18071..00000000000
--- a/paddle/fluid/framework/inference_cached_ops.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace framework {
-
-// cached ops will be captured to accelerate gpu performance.
-// 1. op will generate a cudaGraph to record inner gpu kernels
-// 2. inner gpu kernels can be launched by calling the cudagraphExecutor
-// only once.
-std::vector<std::string> cached_gpu_ops{"conv2d_fusion", "depthwise_conv2d"};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c2a66512676..b4ef3efb821 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/inference_cached_ops.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -710,12 +709,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return in[0] != nullptr;
   }
 
-  size_t InputsSize() const {
-    auto& op_proto =
-        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
-    return op_proto->inputs().size();
-  }
-
   bool HasOutput(const std::string& name) const override {
     // has only one output
     const auto& outs = ctx_.outputs;
@@ -1207,86 +1200,7 @@ struct OperatorWithKernel::CacheImpl {
     return infer_shape_ctx_.get();
   }
 
-  bool updateInputsShapesDimCache() {
-    bool flag = false;
-    size_t inputs_size =
-        std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize());
-    for (size_t i = 0; i < inputs_size; i++) {
-      const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i);
-      if (!infer_shape_ctx_->HasInputs(in_name)) continue;
-      if (!inputs_dim_caches.count(in_name) ||
-          infer_shape_ctx_->GetInputsDim(in_name) !=
-              inputs_dim_caches[in_name]) {
-        inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name);
-        flag = true;
-      }
-    }
-
-#if defined(PADDLE_WITH_CUDA)
-    if (flag) discardCudaGraphCache();
-#endif
-    return flag;
-  }
-
-  bool cudaGraphEnabled(bool need_prepare_data,
-                        bool need_prepare_phi_data,
-                        const std::string& op_type) const {
-#if defined(PADDLE_WITH_CUDA)
-    return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) &&
-           !need_prepare_data && !need_prepare_phi_data;
-#else
-    return false;
-#endif
-  }
-
-  bool cacheEnabled(bool run_phi_kernel,
-                    bool need_prepare_data,
-                    bool need_prepare_phi_data,
-                    const std::string& op_type) const {
-#if defined(PADDLE_WITH_CUDA)
-    if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type))
-      return true;
-#endif
-    return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data);
-  }
-
-#if defined(PADDLE_WITH_CUDA)
-  void startCudaGraphCapture() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
-  }
-
-  void endCudaGraphCapture() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-
-    cudaGraph_t graph_;
-    cudaStreamEndCapture(stream, &graph_);
-    cudaGraphInstantiate(&graph_instance_, graph_, NULL, NULL, 0);
-    graph_generated = true;
-  }
-
-  void runCudaGraph() {
-    phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-    auto stream = ctx->stream();
-    cudaGraphLaunch(graph_instance_, stream);
-  }
-
-  bool cudaGraphGenerated() { return graph_generated; }
-
-  void discardCudaGraphCache() { graph_generated = false; }
-
- private:
-  bool graph_generated{false};
-  cudaGraphExec_t graph_instance_;
-#endif
-
  private:
-  std::map<std::string, std::vector<DDim>> inputs_dim_caches;
   std::unique_ptr<phi::KernelContext> kernel_ctx_;
   std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
 };
@@ -1496,74 +1410,8 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->Info().infer_shape_(&infer_shape_ctx);
 }
 
-void OperatorWithKernel::InitOpCache(const Scope& scope,
-                                     const platform::Place& place) const {
-  if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
-    std::lock_guard<std::mutex> lock(cache_update_mutex_);
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) {
-      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-      pre_scope_ = &scope;
-    }
-  }
-
-  impl_ =
-      new CacheImpl(new phi::KernelContext(),
-                    new RuntimeInferShapeContext(*this, *runtime_ctx_.get()));
-
-  RunImpl(scope, place, runtime_ctx_.get());
-  if (impl_->cacheEnabled(run_phi_kernel_,
-                          need_prepare_data_,
-                          need_prepare_phi_data_,
-                          Type())) {
-    impl_->updateInputsShapesDimCache();
-  }
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  // function name: runOpCache()
-  // effect: reuse cacheImpl to accelerate inference period
-  auto runOpCache = [&]() {
-#if defined(PADDLE_WITH_CUDA)
-    if (impl_->cudaGraphEnabled(
-            need_prepare_data_, need_prepare_phi_data_, Type())) {
-      // cudaGraph cache
-      if (impl_->updateInputsShapesDimCache()) {
-        if (!all_kernels_must_compute_runtime_shape_)
-          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-        (*phi_kernel_)(impl_->getKernelContext());
-      } else if (!impl_->cudaGraphGenerated()) {
-        impl_->startCudaGraphCapture();
-        impl_->getKernelContext();
-        RunImpl(scope, place, runtime_ctx_.get());
-        impl_->endCudaGraphCapture();
-      } else {
-        if (!all_kernels_must_compute_runtime_shape_)
-          this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-        impl_->runCudaGraph();
-      }
-      return;
-    }
-#endif
-    // common cache
-    if (!all_kernels_must_compute_runtime_shape_)
-      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
-    (*phi_kernel_)(impl_->getKernelContext());
-  };
-
-  // function name: updateRuntimeContext
-  // effect: update runtime_ctx from current scope.
-  auto updateRuntimeContext = [&](const Scope& scope) {
-    const Scope* cur_scope = &scope;
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-      std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-        pre_scope_ = cur_scope;
-      }
-    }
-  };
-
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
   // result of HasAttr.
   if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
@@ -1576,18 +1424,20 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
     pre_scope_ = cur_scope;
+  } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
+             !need_prepare_phi_data_) {
+    if (!all_kernels_must_compute_runtime_shape_)
+      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+    (*phi_kernel_)(impl_->getKernelContext());
   } else {
-    if (!impl_) {
-      InitOpCache(scope, place);
-    } else if (impl_->cacheEnabled(run_phi_kernel_,
-                                   need_prepare_data_,
-                                   need_prepare_phi_data_,
-                                   Type())) {
-      runOpCache();
-    } else {
-      updateRuntimeContext(scope);
-      RunImpl(scope, place, runtime_ctx_.get());
+    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+      std::lock_guard<std::mutex> lock(cache_update_mutex_);
+      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+        pre_scope_ = cur_scope;
+      }
     }
+    RunImpl(scope, place, runtime_ctx_.get());
   }
 }
@@ -1852,6 +1702,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   phi::KernelContext phi_kernel_context;
   if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
       !need_prepare_data_) {
+    impl_ =
+        new CacheImpl(new phi::KernelContext(),
+                      new RuntimeInferShapeContext(*this, *runtime_ctx));
     BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
     (*phi_kernel_)(impl_->getKernelContext());
   } else {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 4185b450c7a..17ec9a1f93e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -712,7 +712,6 @@ class OperatorWithKernel : public OperatorBase {
   // used for IndicateOrPromoteVarDataTypes
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
-  void InitOpCache(const Scope& scope, const platform::Place& place) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 739c1466353..6119714c38c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -165,7 +165,6 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "gpu_cpu_map_matmul_v2_to_matmul_pass",
     "fc_fuse_pass",
    "fc_elementwise_layernorm_fuse_pass",
-    "runtime_context_cache_pass",
 };
 
 const std::vector<std::string> kTrtLowerPrecisionPasses{
-- 
GitLab
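
Editor's note (not part of the patch): the reverted CacheImpl recorded an op's GPU kernel launches with CUDA stream capture (cudaStreamBeginCapture / cudaStreamEndCapture), instantiated the captured graph once, and replayed it with cudaGraphLaunch on later runs. The following is a minimal, self-contained sketch of that capture/instantiate/replay pattern under assumed inputs; the "scale" kernel, buffer size, and iteration count are invented for illustration and are not Paddle code.

// Illustrative sketch only (assumed example, not part of the revert):
// record GPU work into a cudaGraph once, then replay it cheaply.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float* x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1 << 20;
  float* d_x = nullptr;
  cudaMalloc(&d_x, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Capture the launches issued on this stream into a graph instead of
  // executing them immediately.
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d_x, 2.0f, n);
  cudaStreamEndCapture(stream, &graph);
  // Same five-argument cudaGraphInstantiate overload the reverted code used.
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // Replay: one cudaGraphLaunch per iteration instead of re-launching kernels.
  for (int iter = 0; iter < 10; ++iter) {
    cudaGraphLaunch(graph_exec, stream);
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d_x);
  printf("replayed capture 10 times\n");
  return 0;
}

A captured graph is only valid while shapes and launch configurations stay fixed, which is why the reverted CacheImpl discarded its graph whenever updateInputsShapesDimCache detected an input-shape change.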