Unverified Commit 579784e2 authored by H huangjiyi, committed by GitHub

[PHI decouple] move dropout_impl and cuda_graph_with_memory_pool from fluid to phi (#49139)

* move dropout_impl from fluid to phi

* move cuda_graph_with_memory_pool from fluid to phi

* update namespace

* remove cuda_graph in fluid

* fix mac-build

* fix bugs

* correct CodeStyle

* fix mac-build

* fix mutable_data

* fix stl include

* fix copy param
Parent 44973c65
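For downstream callers, the substance of this change is an include-path and namespace migration. The sketch below is illustrative only and is not part of the commit; the `example` namespace and helper names are hypothetical, while the include paths and phi symbols are the ones introduced in the hunks that follow.

// Illustrative migration sketch -- not part of the diff.
#ifdef PADDLE_WITH_CUDA
#include <cstdint>

#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"     // was paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"  // was paddle/fluid/operators/dropout_impl_util.h

namespace example {  // hypothetical namespace, used only for this sketch

// Old spelling: paddle::platform::CUDAGraph::IsThisThreadCapturing()
inline bool IsCapturingNow() {
  return phi::backends::gpu::CUDAGraph::IsThisThreadCapturing();
}

// Old spelling: paddle::operators::GetSeedDataAndIncrement(...)
inline uint64_t DropoutSeed(const phi::GPUContext &dev_ctx, int offset) {
  uint64_t seed = 0;
  uint64_t increment = 0;
  phi::funcs::GetSeedDataAndIncrement(dev_ctx,
                                      /*seed=*/nullptr,
                                      /*is_fix_seed=*/false,
                                      /*seed_val=*/0,
                                      offset,
                                      &seed,
                                      &increment);
  return seed;
}

}  // namespace example
#endif  // PADDLE_WITH_CUDA

The dropout kernel drivers follow the same pattern: paddle::operators::DropoutFwGPUKernelDriver and DropoutGradGPUKernelDriver become phi::funcs::DropoutFwGPUKernelDriver and phi::funcs::DropoutGradGPUKernelDriver, as shown in the fmha_ref.h and dropout kernel hunks below.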
......@@ -39,7 +39,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
#if CUDA_VERSION >= 10020
......@@ -157,7 +157,7 @@ class CUDAGraphAllocator
static bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
return false;
#endif
......@@ -1007,7 +1007,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
auto id = platform::CUDAGraph::CapturingPoolID();
auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
auto iter = cuda_graph_map_.find(id);
PADDLE_ENFORCE_NE(
iter,
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
namespace paddle {
......@@ -49,7 +49,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
graph_capturing_stream_set_.insert(stream);
return;
}
......@@ -61,7 +61,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
bool StreamSafeCUDAAllocation::CanBeFreed() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
return graph_capturing_stream_set_.empty() &&
outstanding_event_map_.empty();
}
......
......@@ -319,7 +319,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
data, result, data_num_);
RecordStream(data_allocation, other_stream);
std::unique_ptr<platform::CUDAGraph> cuda_graph =
std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph =
platform::EndCUDAGraphCapture();
int replay_times = 10;
......
......@@ -89,7 +89,7 @@ class CUDAGraphWithInOuts {
int64_t PoolID() const { return graph_->PoolID(); }
private:
std::unique_ptr<platform::CUDAGraph> graph_;
std::unique_ptr<phi::backends::gpu::CUDAGraph> graph_;
std::vector<phi::DenseTensor> ins_;
std::vector<phi::DenseTensor> outs_;
std::vector<int64_t> in_indices_;
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/functors.h"
......@@ -206,7 +206,7 @@ class FMHARef {
stride_b = gemm_k * gemm_n;
if (dropout_param_.dropout_prob_) {
DropoutFwGPUKernelDriver<T>(
phi::funcs::DropoutFwGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
dropout_param_.is_test_,
dropout_param_.dropout_prob_,
......@@ -381,7 +381,7 @@ class FMHARef {
stride_b = gemm_k * gemm_n;
if (dropout_param_.dropout_prob_) {
DropoutFwGPUKernelDriver<T>(
phi::funcs::DropoutFwGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
dropout_param_.is_test_,
dropout_param_.dropout_prob_,
......@@ -552,7 +552,7 @@ class FMHARef {
}
// dropout bw
if (dropout_param_.dropout_prob_) {
DropoutGradGPUKernelDriver<T>(
phi::funcs::DropoutGradGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
false,
dropout_param_.dropout_prob_,
......
......@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/layer_norm_kernel.h"
......@@ -106,7 +106,7 @@ struct DropoutParam {
int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) {
uint64_t tmp_increment;
GetSeedDataAndIncrement(
phi::funcs::GetSeedDataAndIncrement(
ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment);
increment = static_cast<int>(tmp_increment);
return increment;
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
......@@ -23,10 +23,10 @@ namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
void BeginCUDAGraphCapture(phi::GPUPlace place,
cudaStreamCaptureMode mode,
int64_t pool_id) {
auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
......@@ -64,7 +64,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->SetCUDAGraphAllocator(nullptr);
......
......@@ -14,123 +14,38 @@
#pragma once
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#endif
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \
using __Helper = \
::paddle::platform::IsSameKernelHelper<decltype(&__kernel_func), \
&__kernel_func>; \
auto *dev_ctx = \
::paddle::platform::DeviceContextPool::Instance().GetByPlace( \
::paddle::platform::CUDAGraph::CapturingPlace()); \
auto __set_seed_func = \
[=](::paddle::platform::CUDAKernelParams *__params, \
bool __check_only) -> bool { \
if (__check_only) { \
return __params->func() == &__kernel_func && \
__Helper::Compare(*__params, __VA_ARGS__); \
} \
auto &KERNEL_PARAMS = *__params; \
uint64_t __seed, __offset; \
::paddle::operators::GetSeedDataAndIncrement( \
*dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset); \
__seed_expr = static_cast<decltype(__seed_expr)>(__seed); \
__offset_expr = static_cast<decltype(__offset_expr)>(__offset); \
return true; \
}; \
::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \
} \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#else
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#endif
// NOTE: These APIs are not thread-safe.
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
using CUDAGraph = phi::backends::gpu::CUDAGraph;
void BeginCUDAGraphCapture(phi::GPUPlace place,
cudaStreamCaptureMode mode,
int64_t pool_id = CUDAGraph::kInvalidPoolID);
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture();
#endif
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::IsCapturing();
#else
return false;
#endif
}
inline platform::CUDAPlace CUDAGraphCapturingPlace() {
inline phi::GPUPlace CUDAGraphCapturingPlace() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::CapturingPlace();
#else
PADDLE_THROW(platform::errors::Unimplemented(
PADDLE_THROW(phi::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
// Add reset callback if CUDA Graph is capturing.
// Otherwise, invoke callback directly.
template <typename Callback>
inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
return CUDAGraph::AddResetCallbackDuringCapturing(
std::forward<Callback>(callback));
}
#endif
callback();
}
using phi::backends::gpu::IsCUDAGraphCapturing;
template <typename T>
inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
static_assert(std::is_trivial<T>::value, "T must be trivial type");
static_assert(!std::is_same<T, void>::value, "T cannot be void");
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
size_t nbytes = size * sizeof(T);
void *new_host_mem = new uint8_t[nbytes];
std::memcpy(new_host_mem, host_mem, nbytes);
AddResetCallbackIfCapturingCUDAGraph(
[new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
return reinterpret_cast<T *>(new_host_mem);
}
#endif
return host_mem;
}
using phi::backends::gpu::AddResetCallbackIfCapturingCUDAGraph;
using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph;
class SkipCUDAGraphCaptureGuard {
DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard);
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
namespace paddle {
namespace platform {
using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams;
#if CUDA_VERSION < 10010
using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode;
#endif
using CUDAGraph = phi::backends::gpu::CUDAGraph;
using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard;
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
return std::memcmp(&x, &y, sizeof(T)) == 0;
}
template <typename F, F f>
struct IsSameKernelHelper;
template <typename Return,
typename... FuncArgs,
Return (*kernel_fn)(FuncArgs...)>
struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
private:
using FuncArgsTuple = decltype(std::make_tuple(std::declval<FuncArgs>()...));
template <typename TupleT, size_t IDX, bool IsEnd /*=false*/>
struct Impl {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
using CompareT = typename std::tuple_element<IDX, FuncArgsTuple>::type;
if (!IsBitwiseEqual<CompareT>(params.As<CompareT>(IDX),
std::get<IDX>(args))) {
return false;
}
constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size<TupleT>::value);
return Impl<TupleT, IDX + 1, NewIsEnd>::Compare(params, args);
}
};
template <typename TupleT, size_t IDX>
struct Impl<TupleT, IDX, true> {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
return true;
}
};
public:
template <typename... Args>
static bool Compare(const CUDAKernelParams &params, Args... args) {
constexpr auto kNumArgs = sizeof...(FuncArgs);
static_assert(kNumArgs == sizeof...(Args), "Argument number not match");
auto args_tuple = std::make_tuple(args...);
using TupleT = typename std::decay<decltype(args_tuple)>::type;
return Impl<TupleT, 0, kNumArgs == 0>::Compare(params, args_tuple);
}
};
} // namespace platform
} // namespace paddle
......@@ -36,8 +36,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/miopen.h"
#else
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_CUDA
......@@ -230,7 +230,7 @@ class RecordedGpuMallocHelper {
result = hipMalloc(ptr, size);
}
#else
CUDAGraphCaptureModeGuard capture_mode_guard;
phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
if (UNLIKELY(malloc_managed_memory)) {
result = cudaMallocManaged(ptr, size);
} else {
......
......@@ -673,7 +673,7 @@ PYBIND11_MODULE(libpaddle, m) {
m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing);
#ifdef PADDLE_WITH_CUDA
py::class_<platform::CUDAGraph>(m, "CUDAGraph")
py::class_<phi::backends::gpu::CUDAGraph>(m, "CUDAGraph")
.def_static("begin_capture",
[](platform::CUDAPlace place, int mode) {
platform::BeginCUDAGraphCapture(
......@@ -681,10 +681,11 @@ PYBIND11_MODULE(libpaddle, m) {
})
.def_static("end_capture", &platform::EndCUDAGraphCapture)
.def_static("gen_new_memory_pool_id",
&platform::CUDAGraph::UniqueMemoryPoolID)
.def("replay", &platform::CUDAGraph::Replay)
.def("reset", &platform::CUDAGraph::Reset)
.def("print_to_dot_files", &platform::CUDAGraph::PrintToDotFiles);
&phi::backends::gpu::CUDAGraph::UniqueMemoryPoolID)
.def("replay", &phi::backends::gpu::CUDAGraph::Replay)
.def("reset", &phi::backends::gpu::CUDAGraph::Reset)
.def("print_to_dot_files",
&phi::backends::gpu::CUDAGraph::PrintToDotFiles);
#endif
m.def("wait_device", [](const platform::Place &place) {
......
......@@ -236,6 +236,54 @@ class CUDAGraphCaptureModeGuard {
};
#endif
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
return std::memcmp(&x, &y, sizeof(T)) == 0;
}
template <typename F, F f>
struct IsSameKernelHelper;
template <typename Return,
typename... FuncArgs,
Return (*kernel_fn)(FuncArgs...)>
struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
private:
using FuncArgsTuple = decltype(std::make_tuple(std::declval<FuncArgs>()...));
template <typename TupleT, size_t IDX, bool IsEnd /*=false*/>
struct Impl {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
using CompareT = typename std::tuple_element<IDX, FuncArgsTuple>::type;
if (!IsBitwiseEqual<CompareT>(params.As<CompareT>(IDX),
std::get<IDX>(args))) {
return false;
}
constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size<TupleT>::value);
return Impl<TupleT, IDX + 1, NewIsEnd>::Compare(params, args);
}
};
template <typename TupleT, size_t IDX>
struct Impl<TupleT, IDX, true> {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
return true;
}
};
public:
template <typename... Args>
static bool Compare(const CUDAKernelParams &params, Args... args) {
constexpr auto kNumArgs = sizeof...(FuncArgs);
static_assert(kNumArgs == sizeof...(Args), "Argument number not match");
auto args_tuple = std::make_tuple(args...);
using TupleT = typename std::decay<decltype(args_tuple)>::type;
return Impl<TupleT, 0, kNumArgs == 0>::Compare(params, args_tuple);
}
};
} // namespace gpu
} // namespace backends
} // namespace phi
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#endif
namespace phi {
namespace backends {
namespace gpu {
#ifdef PADDLE_WITH_CUDA
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
if (::phi::backends::gpu::CUDAGraph::IsThisThreadCapturing() && \
(__cond)) { \
using __Helper = \
::phi::backends::gpu::IsSameKernelHelper<decltype(&__kernel_func), \
&__kernel_func>; \
auto *dev_ctx = ::phi::DeviceContextPool::Instance().GetByPlace( \
::phi::backends::gpu::CUDAGraph::CapturingPlace()); \
auto __set_seed_func = \
[=](::phi::backends::gpu::CUDAKernelParams *__params, \
bool __check_only) -> bool { \
if (__check_only) { \
return __params->func() == &__kernel_func && \
__Helper::Compare(*__params, __VA_ARGS__); \
} \
auto &KERNEL_PARAMS = *__params; \
uint64_t __seed, __offset; \
::phi::funcs::GetSeedDataAndIncrement( \
*dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset); \
__seed_expr = static_cast<decltype(__seed_expr)>(__seed); \
__offset_expr = static_cast<decltype(__offset_expr)>(__offset); \
return true; \
}; \
::phi::backends::gpu::CUDAGraph::RecordRandomKernelInfo( \
__set_seed_func); \
} \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#else
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#endif
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::IsCapturing();
#else
return false;
#endif
}
// Add reset callback if CUDA Graph is capturing.
// Otherwise, invoke callback directly.
template <typename Callback>
inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
return CUDAGraph::AddResetCallbackDuringCapturing(
std::forward<Callback>(callback));
}
#endif
callback();
}
template <typename T>
inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
static_assert(std::is_trivial<T>::value, "T must be trivial type");
static_assert(!std::is_same<T, void>::value, "T cannot be void");
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
size_t nbytes = size * sizeof(T);
void *new_host_mem = new uint8_t[nbytes];
std::memcpy(new_host_mem, host_mem, nbytes);
AddResetCallbackIfCapturingCUDAGraph(
[new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
return reinterpret_cast<T *>(new_host_mem);
}
#endif
return host_mem;
}
} // namespace gpu
} // namespace backends
} // namespace phi
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
namespace phi {
namespace funcs {
......@@ -319,7 +319,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
context.GetPlace(),
in_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_data, in_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_data->ptr(),
......@@ -368,7 +368,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
inputs_col_num * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
......@@ -484,7 +484,7 @@ class SplitFunctor<phi::GPUContext, T> {
context.GetPlace(),
o_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
outputs_data, o_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_outs_data->ptr(),
......@@ -535,7 +535,7 @@ class SplitFunctor<phi::GPUContext, T> {
context.GetPlace(),
outputs_cols_num * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
outputs_cols, outputs_cols_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
......
......@@ -19,35 +19,29 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <curand_kernel.h>
#include "paddle/fluid/platform/dynload/curand.h"
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#include "paddle/fluid/platform/dynload/hiprand.h"
#endif
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
namespace paddle {
namespace operators {
namespace phi {
namespace funcs {
template <typename T1, typename T2 = T1, typename OutT = T1>
struct DstMaskFunctor {
const float retain_prob_;
const bool is_upscale_in_train_;
using MT = typename details::MPTypeTrait<T1>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline DstMaskFunctor(const float retain_prob,
const bool is_upscale_in_train)
......@@ -149,7 +143,7 @@ __global__ void VectorizedRandomGenerator(const size_t n,
template <typename T1, typename T2 = T1, typename OutT = T1>
struct MaskFunctor {
const float retain_prob_;
using MT = typename details::MPTypeTrait<T1>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline MaskFunctor(const float retain_prob)
: retain_prob_(retain_prob) {
......@@ -173,7 +167,7 @@ struct MaskFunctor {
template <typename T, typename MaskType>
struct DstFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline DstFunctor(const float retain_prob,
const bool is_upscale_in_train,
......@@ -271,7 +265,7 @@ inline void CalcBroadcastedMask(const phi::GPUContext& dev_ctx,
phi::DenseTensor* broadcasted_mask) {
// The broadcast of mask can be combined to the following ElementwiseKernel
// when the BroadcastKernel supports different input types.
broadcasted_mask->mutable_data<uint8_t>(dev_ctx.GetPlace());
dev_ctx.template Alloc<uint8_t>(broadcasted_mask);
std::vector<const phi::DenseTensor*> ins = {&mask};
std::vector<phi::DenseTensor*> outs = {broadcasted_mask};
......@@ -337,7 +331,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
size_t block_size = gpu_config.GetBlockSize();
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
const auto& prop = platform::GetDeviceProperties(device_id);
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
prop.multiProcessorCount / block_size;
grid_size = std::min(grid_size, max_grid_size);
......@@ -393,9 +387,9 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
} else {
if (upscale_in_train) {
// y = x
framework::TensorCopy(x, dev_ctx.GetPlace(), dev_ctx, y);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, y);
} else {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor = static_cast<MT>(1.0f - dropout_prob);
// y = factor * x
ScaleByDropoutFactor<T, MT>(dev_ctx, x, y, factor);
......@@ -405,7 +399,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
......@@ -428,7 +422,7 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& mask,
phi::DenseTensor* grad_x,
bool is_dropout_nd = false) {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
auto stream = dev_ctx.stream();
if (is_test) {
......@@ -465,5 +459,5 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
}
}
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
namespace phi {
namespace funcs {
inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
const phi::DenseTensor* seed,
......@@ -27,13 +29,11 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
const int offset,
uint64_t* seed_data,
uint64_t* increment) {
int device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::DefaultCUDAGenerator(device_id);
auto gen_cuda = dev_ctx.GetGenerator();
if (seed) {
phi::DenseTensor seed_cpu_tensor;
paddle::framework::TensorCopySync(
*seed, platform::CPUPlace(), &seed_cpu_tensor);
phi::Copy(dev_ctx, *seed, phi::CPUPlace(), true, &seed_cpu_tensor);
*seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
*increment = offset;
} else if (!is_fix_seed) {
......@@ -46,5 +46,5 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
}
}
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/dropout_grad_kernel.h"
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
namespace phi {
......@@ -30,14 +30,14 @@ void DropoutGradRawKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
bool upscale_in_train = (mode == "upscale_in_train");
dev_ctx.template Alloc<T>(x_grad);
paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
false);
phi::funcs::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
false);
}
template <typename T, typename Context>
......@@ -51,14 +51,14 @@ void DropoutNdGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
bool upscale_in_train = (mode == "upscale_in_train");
dev_ctx.template Alloc<T>(x_grad);
paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
true);
phi::funcs::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
true);
}
} // namespace phi
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/dropout_kernel.h"
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
namespace phi {
......@@ -36,17 +36,17 @@ void DropoutRawKernel(const Context& dev_ctx,
if (mask) {
dev_ctx.template Alloc<uint8_t>(mask);
}
paddle::operators::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
false);
phi::funcs::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
false);
}
template <typename T, typename Context>
......@@ -66,17 +66,17 @@ void DropoutNdKernel(const Context& dev_ctx,
if (mask) {
dev_ctx.template Alloc<uint8_t>(mask);
}
paddle::operators::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
true);
phi::funcs::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
true);
}
} // namespace phi
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
......