fix cuda dev_ctx allocator cmake deps, test=develop (#19953)

37f76407 · Zeng Jinle · GitHub · ebff68fa · 37f76407 · 37f76407
12 changed file
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -17,8 +17,6 @@ cc_library(memory
        memcpy)
 if (WITH_GPU)
-    add_dependencies(malloc cuda_device_context_allocator_pool)
-    target_link_libraries(malloc cuda_device_context_allocator_pool)
    nv_test(malloc_test
            SRCS malloc_test.cu
            DEPS device_context malloc)

--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -14,12 +14,6 @@ endif()
 if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
-  nv_library(cuda_device_context_allocation SRCS cuda_device_context_allocation.cc
-      DEPS allocator enforce place ${MKLDNN_CTX_DEPS})
-  nv_library(cuda_device_context_allocator SRCS cuda_device_context_allocator.cc 
-      DEPS allocator enforce place cuda_device_context_allocation ${MKLDNN_CTX_DEPS})
-  nv_library(cuda_device_context_allocator_pool SRCS cuda_device_context_allocator_pool.cc
-      DEPS allocator enforce place cuda_device_context_allocation cuda_device_context_allocator ${MKLDNN_CTX_DEPS})
 endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

--- a/paddle/fluid/memory/allocation/cuda_device_context_allocation.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocation.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
-#include <utility>
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-CUDADeviceContextAllocation::CUDADeviceContextAllocation(
-    AllocationPtr allocation)
-    : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
-      underlying_allocation_(std::move(allocation)) {}
-CUDADeviceContextAllocation::~CUDADeviceContextAllocation() {
-  PADDLE_ENFORCE_NOT_NULL(
-      dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
-  auto *p_allocation = underlying_allocation_.release();
-  VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
-          << p_allocation;
-  dev_ctx_->AddStreamCallback([p_allocation] {
-    VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
-    AllocationDeleter()(p_allocation);
-  });
-}
-void CUDADeviceContextAllocation::SetCUDADeviceContext(
-    const platform::CUDADeviceContext *dev_ctx) {
-  dev_ctx_ = dev_ctx;
-}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocation.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocation.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/device_context.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-/**
- * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation.
- * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath
- * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
- * which deletes allocation in the callback.
- */
-class CUDADeviceContextAllocation : public Allocation {
- public:
-  explicit CUDADeviceContextAllocation(AllocationPtr allocation);
-  ~CUDADeviceContextAllocation();
-  void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx);
- private:
-  AllocationPtr underlying_allocation_;
-  const platform::CUDADeviceContext *dev_ctx_{nullptr};
-};
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-CUDADeviceContextAllocator::CUDADeviceContextAllocator(
-    const platform::CUDAPlace place, cudaStream_t default_stream)
-    : place_(place), default_stream_(default_stream) {
-  platform::CUDADeviceGuard guard(place_.device);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaEventCreate(&event_, cudaEventDisableTiming),
-      "Create event failed in CUDADeviceContextAllocator");
-}
-CUDADeviceContextAllocator::~CUDADeviceContextAllocator() {
-  if (event_) {
-    platform::CUDADeviceGuard guard(place_.device);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventDestroy(event_),
-        "Destory event failed in CUDADeviceContextAllocator destroctor");
-  }
-}
-Allocation *CUDADeviceContextAllocator::AllocateImpl(size_t size) {
-  PADDLE_ENFORCE_NOT_NULL(
-      default_stream_,
-      "Didn't set default stream for CUDADeviceContextAllocator");
-  platform::CUDADeviceGuard guard(place_.device);
-  auto allocation =
-      new CUDADeviceContextAllocation(memory::Alloc(place_, size));
-  // Wait for the event on stream
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaEventRecord(event_, default_stream_),
-      "Failed to record event in CUDADeviceContextAllocator");
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaStreamWaitEvent(default_stream_, event_, 0),
-      "Failed to wait event in CUDADeviceContextAllocator");
-  return allocation;
-}
-void CUDADeviceContextAllocator::FreeImpl(Allocation *allocation) {
-  delete allocation;
-}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -15,15 +15,58 @@
 #pragma once
 #include <cuda_runtime.h>
+#include <map>
+#include <memory>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
 namespace memory {
 namespace allocation {
+/**
+ * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation.
+ * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath
+ * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
+ * which deletes allocation in the callback.
+ */
+class CUDADeviceContextAllocation : public Allocation {
+ public:
+  explicit CUDADeviceContextAllocation(AllocationPtr allocation)
+      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
+        underlying_allocation_(std::move(allocation)) {}
+  ~CUDADeviceContextAllocation() {
+    PADDLE_ENFORCE_NOT_NULL(
+        dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
+    auto *p_allocation = underlying_allocation_.release();
+    VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
+            << p_allocation;
+    dev_ctx_->AddStreamCallback([p_allocation] {
+      VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
+      AllocationDeleter()(p_allocation);
+    });
+  }
+  void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) {
+    dev_ctx_ = dev_ctx;
+  }
+ private:
+  AllocationPtr underlying_allocation_;
+  const platform::CUDADeviceContext *dev_ctx_{nullptr};
+};
 /**
 * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation
 * after waiting for a self-created event on the default stream. It does so to
@@ -33,12 +76,42 @@ namespace allocation {
 class CUDADeviceContextAllocator : public Allocator {
 public:
  explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
-                                      cudaStream_t default_stream);
+                                      cudaStream_t default_stream)
-  ~CUDADeviceContextAllocator();
+      : place_(place), default_stream_(default_stream) {
+    platform::CUDADeviceGuard guard(place_.device);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventCreate(&event_, cudaEventDisableTiming),
+        "Create event failed in CUDADeviceContextAllocator");
+  }
+  ~CUDADeviceContextAllocator() {
+    if (event_) {
+      platform::CUDADeviceGuard guard(place_.device);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaEventDestroy(event_),
+          "Destory event failed in CUDADeviceContextAllocator destroctor");
+    }
+  }
 protected:
-  Allocation *AllocateImpl(size_t size) override;
+  Allocation *AllocateImpl(size_t size) override {
-  void FreeImpl(Allocation *allocation) override;
+    PADDLE_ENFORCE_NOT_NULL(
+        default_stream_,
+        "Didn't set default stream for CUDADeviceContextAllocator");
+    platform::CUDADeviceGuard guard(place_.device);
+    auto allocation =
+        new CUDADeviceContextAllocation(memory::Alloc(place_, size));
+    // Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventRecord(event_, default_stream_),
+        "Failed to record event in CUDADeviceContextAllocator");
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamWaitEvent(default_stream_, event_, 0),
+        "Failed to wait event in CUDADeviceContextAllocator");
+    return allocation;
+  }
+  void FreeImpl(Allocation *allocation) override { delete allocation; }
 private:
  platform::CUDAPlace place_;
@@ -46,6 +119,49 @@ class CUDADeviceContextAllocator : public Allocator {
  cudaStream_t default_stream_{nullptr};
 };
+/**
+ * CUDADeviceContextAllocatorPool is a singletion stores mapping from
+ * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
+ * CUDADeviceContext's compute stream isn't default stream, it can call this
+ * class to allocate GPU memory which will be released by a callback after
+ * stream execution.
+ */
+class CUDADeviceContextAllocatorPool {
+ public:
+  static CUDADeviceContextAllocatorPool &Instance() {
+    static CUDADeviceContextAllocatorPool pool;
+    return pool;
+  }
+  AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
+    auto iter =
+        allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
+    PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
+                      "CUDADeviceContextAllocatorPool initialization error");
+    auto &allocator = iter->second;
+    AllocationPtr allocation = allocator->Allocate(size);
+    static_cast<CUDADeviceContextAllocation *>(allocation.get())
+        ->SetCUDADeviceContext(&dev_ctx);
+    return allocation;
+  }
+ private:
+  CUDADeviceContextAllocatorPool() {
+    std::vector<int> devices = platform::GetSelectedDevices();
+    for (int i : devices) {
+      auto place = platform::CUDAPlace(i);
+      auto compute_stream =
+          platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
+      auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
+          new CUDADeviceContextAllocator(place, compute_stream));
+      allocators_.insert(make_pair(place, allocator));
+    }
+  }
+  std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
+      allocators_;
+};
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
-#include <utility>
-#include <vector>
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-CUDADeviceContextAllocatorPool &CUDADeviceContextAllocatorPool::Instance() {
-  static CUDADeviceContextAllocatorPool pool;
-  return pool;
-}
-AllocationPtr CUDADeviceContextAllocatorPool::Alloc(
-    const platform::CUDADeviceContext &dev_ctx, size_t size) {
-  auto iter =
-      allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
-  PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
-                    "CUDADeviceContextAllocatorPool initialization error");
-  auto &allocator = iter->second;
-  AllocationPtr allocation = allocator->Allocate(size);
-  static_cast<CUDADeviceContextAllocation *>(allocation.get())
-      ->SetCUDADeviceContext(&dev_ctx);
-  return allocation;
-}
-CUDADeviceContextAllocatorPool::CUDADeviceContextAllocatorPool() {
-  std::vector<int> devices = platform::GetSelectedDevices();
-  for (int i : devices) {
-    auto place = platform::CUDAPlace(i);
-    auto compute_stream =
-        platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
-    auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
-        new CUDADeviceContextAllocator(place, compute_stream));
-    allocators_.insert(make_pair(place, allocator));
-  }
-}
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <map>
-#include <memory>
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
-/**
- * CUDADeviceContextAllocatorPool is a singletion stores mapping from
- * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
- * CUDADeviceContext's compute stream isn't default stream, it can call this
- * class to allocate GPU memory which will be released by a callback after
- * stream execution.
- */
-class CUDADeviceContextAllocatorPool {
- public:
-  static CUDADeviceContextAllocatorPool &Instance();
-  AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size);
- private:
-  CUDADeviceContextAllocatorPool();
-  std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
-      allocators_;
-};
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -17,10 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
-#endif
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
@@ -35,26 +31,5 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
  return allocation::AllocatorFacade::Instance().Alloc(place, size);
 }
-AllocationPtr Alloc(const platform::DeviceContext &dev_ctx, size_t size) {
-  auto place = dev_ctx.GetPlace();
-#ifdef PADDLE_WITH_CUDA
-  if (size == 0 || !platform::is_gpu_place(place)) {
-    return Alloc(place, size);
-  }
-  auto *default_dev_ctx = static_cast<platform::CUDADeviceContext *>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto &desired_dev_ctx =
-      static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return Alloc(place, size);
-  } else {
-    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
-        desired_dev_ctx, size);
-  }
-#else
-  return Alloc(place, size);
-#endif
-}
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -70,7 +70,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
    ${dgc_deps})

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -18,11 +18,39 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "glog/logging.h"
+namespace paddle {
+namespace memory {
+AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
+  auto place = dev_ctx.GetPlace();
+#ifdef PADDLE_WITH_CUDA
+  if (size == 0 || !platform::is_gpu_place(place)) {
+    return Alloc(place, size);
+  }
+  auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(place));
+  auto& desired_dev_ctx =
+      static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
+    return Alloc(place, size);
+  } else {
+    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
+        desired_dev_ctx, size);
+  }
+#else
+  return Alloc(place, size);
+#endif
+}
+}  // namespace memory
+}  // namespace paddle
 namespace paddle {
 namespace platform {
@@ -174,6 +202,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
 };
+void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
+  if (required_workspace_bytes <= WorkspaceSize()) {
+    return;
+  }
+  // reset allocation first before re-allocate to save memory
+  allocation_.reset();
+  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
+}
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  CUDADeviceGuard guard(place_.device);
  compute_capability_ = GetCUDAComputeCapability(place_.device);

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -220,14 +220,7 @@ class CudnnWorkspaceHandle {
    ResetWorkspace();
  }
-  inline void ReallocWorkspace(size_t required_workspace_bytes) {
+  void ReallocWorkspace(size_t required_workspace_bytes);
-    if (required_workspace_bytes <= WorkspaceSize()) {
-      return;
-    }
-    // reset allocation first before re-allocate to save memory
-    allocation_.reset();
-    allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
-  }
  inline void ResetWorkspace() { allocation_ = nullptr; }