Unverified commit f9591bb1, authored by Zeng Jinle, committed by GitHub

Support CUDA Graph on ParallelExecutor (#36250)

* support CUDA Graph on PE

* add ut, fix CI compile

* reduce memory consumption

* fix CUDA 10 CI

* improve coverage

* improve python coverage
Parent: ca16e8fd
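For orientation, the sketch below condenses how the feature added in this commit is meant to be used from Python. It is distilled from the new unit test at the end of the diff; main_program, startup_program, and loss are placeholders for the user's own static-graph model and are not defined here.

import paddle
from paddle.device.cuda.graphs import CUDAGraph

paddle.enable_static()
place = paddle.CUDAPlace(0)

# allow_cuda_graph_capture is the new BuildStrategy flag added in this commit.
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True
build_strategy.fix_op_run_order = True
build_strategy.fuse_all_optimizer_ops = True

compiled_program = paddle.static.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy, places=place)

exe = paddle.static.Executor(place)
exe.run(startup_program)
exe.run(compiled_program)  # warm-up batch before capturing

cuda_graph = CUDAGraph(place, mode="global")
cuda_graph.capture_begin()
exe.run(compiled_program)  # feeding and fetching are not allowed while capturing
cuda_graph.capture_end()

for _ in range(10):
    cuda_graph.replay()  # inputs are updated in place through persistable tensors
cuda_graph.reset()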
......@@ -143,6 +143,8 @@ struct BuildStrategy {
// Turn off inplace addto by default.
bool enable_addto_{false};
bool allow_cuda_graph_capture_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
......
......@@ -86,19 +86,28 @@ struct ScaleLossGradFunctor {
}
};
std::string ScaleLossGradOpHandle::LossGradName() const {
return static_cast<VarHandle *>(this->outputs_[0])->name();
}
void ScaleLossGradOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
// Doesn't wait any event
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
-  auto *tensor =
-      local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
+  RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true);
+}
void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) {
auto *tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
-  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
+  if (record_event) {
+    this->RunAndRecordEvent(
+        [&] { framework::VisitDataType(out_dtype_, func); });
+  } else {
+    framework::VisitDataType(out_dtype_, func);
+  }
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
......
......@@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
std::string Name() const override;
platform::Place GetPlace() const { return place_; }
void RunOnVar(Variable *var, bool record_event = false);
std::string LossGradName() const;
protected:
void RunImpl() override;
......
......@@ -22,7 +22,9 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
......@@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
PrepareLocalExeScopes();
}
static void RunProgramDescs(const ProgramDescs &programs,
const std::vector<Scope *> &local_exec_scopes,
const std::vector<platform::Place> &places) {
for (auto &program : programs) {
for (auto &op_desc : program.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes[i], places[i]);
}
}
}
}
FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
DropLocalExeScopes(/*need_wait=*/false);
}
#endif
if (drop_scope_counter_ == 0) {
platform::RecordEvent e("InitLocalVars");
InitVariables();
......@@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run(
++drop_scope_counter_;
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ ||
DropScopeOrNot()) {
-    DropLocalExeScopes();
+    DropLocalExeScopes(!platform::IsCUDAGraphCapturing());
}
if (VLOG_IS_ON(5)) {
......@@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kStartupProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kStartupProgramDescs);
-    for (auto &program_desc : program_descs) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-          auto op = OpRegistry::CreateOp(*op_desc);
-          op->Run(*local_exec_scopes_[i], places_[i]);
-        }
-      }
-    }
+    RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
is_initialized_ = true;
}
......@@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);
-    for (auto &program_desc : program_descs) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-          auto op = OpRegistry::CreateOp(*op_desc);
-          op->Run(*local_exec_scopes_[i], places_[i]);
-        }
-      }
-    }
+    RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
}
-void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
+void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) {
platform::RecordEvent drop_scope_event("DropLocalExeScopes");
drop_scope_counter_ = 0;
-  for (auto &p : places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
+  if (need_wait) {
+    for (auto &p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+  }
scope_monitor_.ClearHistoryLocalExecScopes();
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
......
......@@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FetchResultType Run(const std::vector<std::string>& fetch_tensors,
bool return_merged) override;
-  void DropLocalExeScopes();
+  void DropLocalExeScopes(bool need_wait = true);
bool NeedCreateLocalExeScope();
......
......@@ -115,6 +115,7 @@ message BuildStrategy {
optional bool enable_auto_fusion = 11 [ default = false ];
optional bool enable_addto = 12 [ default = false ];
optional bool fix_op_run_order = 13 [ default = false ];
optional bool allow_cuda_graph_capture = 14 [ default = false ];
}
message ExecutionStrategy {
......
-cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
+cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
......@@ -21,14 +22,23 @@ namespace paddle {
namespace framework {
namespace ir {
template <typename T>
static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base,
const platform::Place &place) {
auto *op = dynamic_cast<T *>(op_base);
return op && op->GetPlace() == place;
}
static bool IsLockAndRecordEventFreeComputationOpHandle(
details::ComputationOpHandle *op, const OpGraphView &graph_view) {
if (!platform::is_gpu_place(op->GetPlace()) &&
!platform::is_xpu_place(op->GetPlace()))
return false;
for (auto &pending_op : graph_view.PendingOps(op)) {
-    auto *tmp = dynamic_cast<details::ComputationOpHandle *>(pending_op);
-    if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
+    if (!IsMatchedPlaceSingleDeviceOp<details::ComputationOpHandle>(
+            pending_op, op->GetPlace()) &&
+        !IsMatchedPlaceSingleDeviceOp<details::ScaleLossGradOpHandle>(
+            pending_op, op->GetPlace())) {
return false;
}
}
......
......@@ -27,6 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
......@@ -34,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -43,6 +45,10 @@ limitations under the License. */
DECLARE_double(eager_delete_tensor_gb);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_bool(sync_nccl_allreduce);
#endif
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
......@@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// ncclOp
std::vector<ir::Graph *> async_graphs =
CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name);
PrepareForCUDAGraphCapture(graph);
graph = member_->ApplyMemoryOptimizePass(graph);
async_graphs[0] = graph;
......@@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices(
FetchResultType ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
VLOG(3) << "enter ParallelExecutor Run";
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
PADDLE_ENFORCE_EQ(
member_->build_strategy_.allow_cuda_graph_capture_, true,
platform::errors::InvalidArgument(
"You must turn on build_strategy.allow_cuda_graph_capture = True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
member_->places_[0], platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
#ifdef WITH_GPERFTOOLS
if (gProfileStarted) {
ProfilerFlush();
......@@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse(
void ParallelExecutor::FeedTensorsIntoLocalScopes(
const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
for (auto &tensor : tensors) {
PADDLE_ENFORCE_EQ(
tensor.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
}
return;
}
if (!member_->AllowPartialFeed()) {
PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(),
platform::errors::Unimplemented(
......@@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
const std::unordered_map<std::string, LoDTensor> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
tensors.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
return;
}
size_t num_places = member_->places_.size();
bool allow_partial_feed = member_->AllowPartialFeed();
......@@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const {
return member_->executor_->Graph();
}
void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) {
const auto &build_strategy = member_->build_strategy_;
if (!build_strategy.allow_cuda_graph_capture_) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
build_strategy.async_mode_, false,
platform::errors::InvalidArgument(
"Async Executor does not support CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(), false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
"when running the first batch."));
PADDLE_ENFORCE_EQ(
member_->places_.size(), 1,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when one GPU device is running."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
std::unordered_map<std::string, std::vector<VarDesc *>> all_vars;
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
auto *var_desc = node->Var();
all_vars[var_desc->Name()].emplace_back(var_desc);
}
}
auto mark_var_as_persistable = [&all_vars](const std::string &name) {
auto iter = all_vars.find(name);
if (iter != all_vars.end()) {
for (auto *var_desc : iter->second) {
var_desc->SetPersistable(true);
}
}
};
// Step 1: All fused vars must be persistable.
if (graph->Has(details::kFusedVars)) {
auto &fused_vars = graph->Get<details::FusedVars>(details::kFusedVars);
for (auto &fused_var : fused_vars) {
fused_var.second.persistable_ = true;
mark_var_as_persistable(fused_var.first);
}
}
// Step 2: All pinned vars must be persistable.
if (graph->Has(details::kPinnedVars)) {
auto &pinned_vars = graph->Get<details::PinnedVars>(details::kPinnedVars);
for (auto &pinned_var : pinned_vars) {
mark_var_as_persistable(pinned_var);
}
}
// Step 3: Move all main programs to startup programs to make sure that
// the main programs would only be run once.
if (graph->Has(details::kProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
auto &main_programs =
graph->Get<details::ProgramDescs>(details::kProgramDescs);
for (auto &main_program : main_programs) {
startup_programs.emplace_back(main_program);
}
graph->Erase(details::kProgramDescs);
}
// Step 4: Mark all vars in startup programs to be persistable.
if (graph->Has(details::kStartupProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
for (auto &startup_program : startup_programs) {
for (auto &op_desc : startup_program.Block(0).AllOps()) {
for (auto &output : op_desc->OutputArgumentNames()) {
mark_var_as_persistable(output);
}
}
}
}
// Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy.
auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
auto *scope = member_->local_scopes_[0];
for (auto *op : ops) {
auto *loss_grad_op = dynamic_cast<details::ScaleLossGradOpHandle *>(op);
if (loss_grad_op == nullptr) continue;
auto loss_grad_name = loss_grad_op->LossGradName();
mark_var_as_persistable(loss_grad_name);
loss_grad_op->RunOnVar(scope->Var(loss_grad_name));
loss_grad_op->SetSkipRunning(true);
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
} // namespace framework
} // namespace paddle
......
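The checks in PrepareForCUDAGraphCapture and Run above translate into a few user-visible preconditions on the Python side. A minimal sketch of the corresponding setup (the flag and strategy names come from this diff; the rest is illustrative):

import paddle
import paddle.fluid as fluid

# FLAGS_sync_nccl_allreduce must be False before capturing.
fluid.set_flags({'FLAGS_sync_nccl_allreduce': False})

# Capture must be enabled explicitly through the build strategy.
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True

places = [paddle.CUDAPlace(0)]  # exactly one CUDA place is supported
fetch_list = []                 # fetch_tensors must be empty while capturing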
......@@ -144,6 +144,8 @@ class ParallelExecutor {
void SetReaderOpDeviceInfoOfGraphs(
const std::vector<ir::Graph *> &final_graphs);
void PrepareForCUDAGraphCapture(ir::Graph *graph);
ParallelExecutorPrivate *member_;
std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
std::vector<VariableInfo> var_infos_;
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/cudnn_desc.h"
namespace paddle {
namespace operators {
......@@ -480,6 +481,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
......@@ -601,6 +603,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
......@@ -70,6 +70,9 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
cudaStreamCaptureStatus status;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo(
capturing_graph_->stream_, &status, &(capturing_graph_->id_)));
PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
platform::errors::PermissionDenied(
"CUDA Graph should not be invalidated."));
VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_;
}
......@@ -88,5 +91,14 @@ std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
#endif
}
bool CUDAGraph::IsValidCapturing() {
if (!IsCapturing()) return false;
cudaStreamCaptureStatus status;
CUDAGraphID id;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id));
return status == cudaStreamCaptureStatusActive;
}
} // namespace platform
} // namespace paddle
......@@ -84,6 +84,10 @@ class CUDAGraph {
return capturing_graph_->place_;
}
// This API can be used to debug which GPU operation is not
// supported during capturing CUDA Graph.
static bool IsValidCapturing();
private:
#if CUDA_VERSION >= 10010
cudaGraph_t graph_{nullptr};
......@@ -104,7 +108,8 @@ class CUDAGraphCaptureModeGuard {
DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
public:
-  explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) {
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
if (UNLIKELY(CUDAGraph::IsCapturing())) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
// After cudaThreadExchangeStreamCaptureMode is called,
......@@ -128,7 +133,8 @@ class CUDAGraphCaptureModeGuard {
DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
public:
-  explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {}
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
};
#endif
......
......@@ -22,8 +22,10 @@ namespace platform {
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
cudaStreamCaptureMode mode) {
-  auto stream =
-      platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
+  auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+  dev_ctx->cudnn_workspace_handle().ResetWorkspace();
+  auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
auto id = CUDAGraph::CapturingID();
memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph(
......@@ -35,6 +37,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
}
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
return CUDAGraph::EndCapture();
}
#endif
......
......@@ -558,7 +558,7 @@ class RecordedCudaMallocHelper {
#ifdef PADDLE_WITH_HIP
auto result = hipMalloc(ptr, size);
#else
-  CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed};
+  CUDAGraphCaptureModeGuard capture_mode_guard;
auto result = cudaMalloc(ptr, size);
#endif
if (result == gpuSuccess) {
......
......@@ -736,6 +736,17 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_copy_from",
[](framework::Tensor &self, const framework::Tensor &other,
const platform::Place &place, int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(other, place, &self);
} else {
auto sliced = other.Slice(0, batch_size);
framework::TensorCopy(sliced, place, &self);
}
},
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
......@@ -2299,7 +2310,14 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
m.def("cuda_empty_cache", platform::EmptyCache);
m.def("cuda_empty_cache", [] {
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
platform::CUDAPlace(dev_id));
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
}
platform::EmptyCache();
});
m.def("get_device_properties",
[](int id) -> const gpuDeviceProp & {
return platform::GetDeviceProperties(id);
......@@ -3211,6 +3229,13 @@ All parameter, weight, gradient are variables in Paddle.
[](BuildStrategy &self, bool fix_op_run_order) {
self.fix_op_run_order_ = fix_op_run_order;
})
.def_property("allow_cuda_graph_capture",
[](const BuildStrategy &self) {
return self.allow_cuda_graph_capture_;
},
[](BuildStrategy &self, bool allow_cuda_graph_capture) {
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
})
.def("_copy",
[](const BuildStrategy &self) {
auto new_bs = self;
......
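The pybind changes above surface two small hooks on the Python side: Tensor._copy_from and the extended cuda_empty_cache. A hedged sketch of how they could be exercised; the use of core.LoDTensor here is illustrative and not part of this diff:

import numpy as np
from paddle.fluid import core

place = core.CPUPlace()

src = core.LoDTensor()
src.set(np.random.rand(8, 4).astype('float32'), place)

dst = core.LoDTensor()
dst._copy_from(src, place)     # copies the whole tensor (batch_size defaults to -1)
dst._copy_from(src, place, 2)  # copies only the first 2 rows via Slice(0, 2)

if core.is_compiled_with_cuda():
    core.cuda_empty_cache()  # now also resets the cuDNN workspace of every selected device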
......@@ -1044,9 +1044,15 @@ class Executor(object):
lr_value = lr_sheduler()
lr_var = program._program.global_block().vars[lr_sheduler._var_name]
lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
-                exe.feed_and_split_tensor_into_local_scopes({
-                    lr_sheduler._var_name: lr_tensor
-                })
+                if core.is_cuda_graph_capturing():
+                    warnings.warn(
+                        "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not "
+                        "take any effect! Please set the learning rate manually before each batch!"
+                    )
+                else:
+                    exe.feed_and_split_tensor_into_local_scopes({
+                        lr_sheduler._var_name: lr_tensor
+                    })
fetch_var_names = list(map(_to_name_str, fetch_list))
tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
......
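Because of the warning above, a captured graph reads the learning rate from a persistable tensor that has to be updated by hand before every replay. A short sketch of that pattern, mirroring the unit test below (the surrounding main_program, scope, lr scheduler, place, and cuda_graph objects follow the test and are not redefined here):

import numpy as np

lr_var = main_program.global_block().var(lr._var_name)
lr_t = scope.var(lr_var.name).get_tensor()

lr_t.set(np.array([lr()], dtype='float32'), place)  # set the current learning rate manually
cuda_graph.replay()
lr.step()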
......@@ -17,18 +17,105 @@ import paddle.fluid as fluid
from paddle.device.cuda.graphs import CUDAGraph
import unittest
import numpy as np
from paddle.fluid.dygraph.base import switch_to_static_graph
from simple_nets import simple_fc_net_with_inputs
class TestCUDAGraph(unittest.TestCase):
def setUp(self):
-        fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'})
+        if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(
+        ):
+            fluid.set_flags({
+                'FLAGS_allocator_strategy': 'auto_growth',
+                'FLAGS_sync_nccl_allreduce': False,
+                'FLAGS_cudnn_deterministic': True
+            })
def random_tensor(self, shape):
return paddle.to_tensor(
np.random.randint(
low=0, high=10, size=shape).astype("float32"))
-    def test_cuda_graph(self):
+    @switch_to_static_graph
+    def test_cuda_graph_static_graph(self):
if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
return
seed = 100
loss_cuda_graph = self.cuda_graph_static_graph_main(
seed, use_cuda_graph=True)
loss_no_cuda_graph = self.cuda_graph_static_graph_main(
seed, use_cuda_graph=False)
self.assertEqual(loss_cuda_graph, loss_no_cuda_graph)
def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
batch_size = 1
class_num = 10
image_shape = [batch_size, 784]
label_shape = [batch_size, 1]
paddle.seed(seed)
np.random.seed(seed)
startup = paddle.static.Program()
main = paddle.static.Program()
with paddle.static.program_guard(main, startup):
image = paddle.static.data(
name="image", shape=image_shape, dtype='float32')
label = paddle.static.data(
name="label", shape=label_shape, dtype='int64')
image.persistable = True
label.persistable = True
loss = simple_fc_net_with_inputs(image, label, class_num)
loss.persistable = True
lr = paddle.optimizer.lr.PiecewiseDecay(
boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04])
optimizer = paddle.optimizer.SGD(learning_rate=lr)
optimizer.minimize(loss)
place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
scope = paddle.static.Scope()
with paddle.static.scope_guard(scope):
exe.run(startup)
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True
build_strategy.fix_op_run_order = True
build_strategy.fuse_all_optimizer_ops = True
compiled_program = paddle.static.CompiledProgram(
main).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
places=place)
image_t = scope.var(image.name).get_tensor()
label_t = scope.var(label.name).get_tensor()
loss_t = scope.var(loss.name).get_tensor()
lr_var = main.global_block().var(lr._var_name)
self.assertTrue(lr_var.persistable)
lr_t = scope.var(lr_var.name).get_tensor()
cuda_graph = None
for batch_id in range(20):
image_t.set(
np.random.rand(*image_shape).astype('float32'), place)
label_t.set(np.random.randint(
low=0, high=class_num, size=label_shape, dtype='int64'),
place)
if batch_id == 1 and use_cuda_graph:
cuda_graph = CUDAGraph(place, mode="global")
cuda_graph.capture_begin()
exe.run(compiled_program)
cuda_graph.capture_end()
if cuda_graph:
lr_t.set(np.array([lr()], dtype='float32'), place)
cuda_graph.replay()
else:
exe.run(compiled_program)
lr.step()
if cuda_graph:
cuda_graph.reset()
return np.array(loss_t)
def test_cuda_graph_dynamic_graph(self):
if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
return
......