make cuda graph thread local allocator (#37814)

62b1f38c · sneaxiy · GitHub · c732c831 · 62b1f38c · 62b1f38c
4 changed file
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
  const AllocatorMap& GetAllocatorMap() {
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
      auto id = platform::CUDAGraph::CapturingID();
      auto iter = cuda_graph_allocator_map_.find(id);
      PADDLE_ENFORCE_NE(
          iter, cuda_graph_allocator_map_.end(),
          platform::errors::PermissionDenied(
              "No memory pool is prepared for CUDA Graph capturing."));
+      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
      return iter->second->allocators_;
    } else {
      return allocators_;
@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
+        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
 #endif
 #if defined(PADDLE_WITH_CUDA)

--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace platform {
 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
+paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
 void CUDAGraph::Reset() {
  if (is_reset_) return;
@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
      IsCapturing(), true,
      errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
                               "Graph is capturing."));
+  if (IsThreadLocalCapturing()) {
+    PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
+                      platform::errors::PermissionDenied(
+                          "When capturing CUDA Graph in the thread local mode, "
+                          "you cannot begin segmented capturing in the thread "
+                          "which is not the one that starts the capturing."));
+  }
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
      capturing_graph_->stream_, capturing_graph_->capture_mode_));
  PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
  capturing_graph_->place_ = place;
  capturing_graph_->stream_ = stream;
  capturing_graph_->capture_mode_ = mode;
+  if (mode == cudaStreamCaptureModeThreadLocal) {
+    capturing_thread_id_ = std::this_thread::get_id();
+    VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
+             << capturing_thread_id_;
+  }
  BeginSegmentCapture();
 #endif
 }
@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
 std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
  EndSegmentCapture();
+  capturing_thread_id_ = paddle::none;
  return std::move(capturing_graph_);
 }

--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
@@ -18,6 +18,7 @@
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <thread>
 #include <vector>
 #include "cuda.h"          // NOLINT
 #include "cuda_runtime.h"  // NOLINT
@@ -26,6 +27,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/utils/optional.h"
 namespace paddle {
 namespace platform {
@@ -99,6 +101,25 @@ class CUDAGraph {
  // supported during capturing CUDA Graph.
  static bool IsValidCapturing();
+  static bool IsThreadLocalCapturing() {
+#if CUDA_VERSION >= 10010
+    return IsCapturing() &&
+           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
+#else
+    return false;
+#endif
+  }
+  static bool IsThisThreadCapturing() {
+    if (UNLIKELY(IsCapturing())) {
+      return IsThreadLocalCapturing()
+                 ? capturing_thread_id_.get() == std::this_thread::get_id()
+                 : true;
+    } else {
+      return false;
+    }
+  }
 private:
  static CUDAGraphID UniqueID() {
    static std::atomic<CUDAGraphID> id;
@@ -118,6 +139,7 @@ class CUDAGraph {
  bool is_reset_{false};
  std::mutex mtx_;
+  static paddle::optional<std::thread::id> capturing_thread_id_;
  static std::unique_ptr<CUDAGraph> capturing_graph_;
 };

--- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py
@@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase):
            paddle.set_flags({
                'FLAGS_allocator_strategy': 'auto_growth',
                'FLAGS_sync_nccl_allreduce': False,
-                'FLAGS_cudnn_deterministic': True
+                'FLAGS_cudnn_deterministic': True,
+                'FLAGS_use_stream_safe_cuda_allocator': False,
            })
    def random_tensor(self, shape):
@@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase):
        finally:
            graph.reset()
+    def test_dataloader(self):
+        if not can_use_cuda_graph():
+            return
+        class AutoIncDataset(paddle.io.Dataset):
+            def __init__(self, n, dtype):
+                self.n = n
+                self.dtype = dtype
+            def __len__(self):
+                return self.n
+            def __getitem__(self, idx):
+                return np.array([idx]).astype(self.dtype)
+        n = 100
+        dtype = 'int64'
+        dataset = AutoIncDataset(n, dtype)
+        data_loader = paddle.io.DataLoader(
+            dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
+        x = None
+        y = None
+        graph = None
+        for i, data in enumerate(data_loader):
+            if graph is None:
+                x = data
+                x = x.cuda()
+                graph = CUDAGraph()
+                graph.capture_begin()
+                y = x * x
+                graph.capture_end()
+            else:
+                x.copy_(data, False)
+                x = x.cuda()
+            graph.replay()
+            actual_x = np.array([[i]]).astype(dtype)
+            actual_y = np.array([[i * i]]).astype(dtype)
+            self.assertTrue(np.array_equal(actual_x, x.numpy()))
+            self.assertTrue(np.array_equal(actual_y, y.numpy()))
 if __name__ == "__main__":
    unittest.main()