FIX: Improve fallback gpu allocator

169022d0 · liaogang · b29923f9 · 169022d0 · 169022d0 · 169022d0
7 changed files
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
 if(${WITH_GPU})
  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_test(system_allocator_test
+    SRCS system_allocator_test.cc
+    DEPS system_allocator gpu_info gflags)
 else(${WITH_GPU})
  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)

--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,32 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"

 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock

 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"

 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");

 namespace paddle {
 namespace memory {
 namespace detail {

-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
+  // The CPU allocator never distinguishes between backing stores, so it
+  // always reports 0; this keeps index defined even when nullptr is returned.
+  index = 0;
+
  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
  // malloc might not return nullptr if size is zero, but the returned
  // pointer shall not be dereferenced -- so we make it nullptr.
  if (size <= 0) return nullptr;

  void* p = malloc(size);
  if (p != nullptr && FLAGS_use_pinned_memory) {
    mlock(p, size);
@@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) {
  return p;
 }

-void CPUAllocator::Free(void* p, size_t size) {
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
  if (p != nullptr && FLAGS_use_pinned_memory) {
    munlock(p, size);
  }
@@ -55,29 +62,62 @@ void CPUAllocator::Free(void* p, size_t size) {

 #ifndef PADDLE_ONLY_CPU

-void* GPUAllocator::Alloc(size_t size) {
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
+  // Always leave index defined, even on the nullptr paths below, so that
+  // callers can forward it to Free() unconditionally.
+  index = 0;
+
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
-  if (size <= 0) {
-    return nullptr;
-  }
+  if (size <= 0) return nullptr;

+  size_t available = 0;
+  size_t capacity = 0;
+  paddle::platform::GpuMemoryUsage(available, capacity);
+
+  // Reserve memory for page tables, etc.
+  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t remaining = available > reserving ? available - reserving : 0;
+
+  // If the remaining size is no less than the requested size, use a
+  // regular cudaMalloc to allocate GPU memory (reported as index 0).
  void* p = 0;
-  cudaError_t result =
-      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-  if (result != cudaSuccess) {
-    cudaGetLastError();  // clear error if there is any.
+  if (size <= remaining) {
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      index = 0;
+      total_alloc_size_ += size;
+      return p;
+    }
+    // Clear the recorded cudaMalloc failure so that it is not reported
+    // by a later cudaGetLastError() once the fallback succeeds.
+    cudaGetLastError();
  }
-  return result == cudaSuccess ? p : nullptr;
+
+  // If the remaining size is less than the requested size, or cudaMalloc
+  // failed, fall back to cudaMallocHost (reported as index 1).
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    total_alloc_size_ += size;
+    return p;
+  }
+
+  // Both allocators failed; clear the recorded error before reporting
+  // the failure to the caller through nullptr.
+  cudaGetLastError();
+  return nullptr;
 }

-void GPUAllocator::Free(void* p, size_t size) {
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
  // Purposefully allow cudaErrorCudartUnloading, because
  // that is returned if you ever call cudaFree after the
  // driver has already shutdown. This happens only if the
  // process is terminating, in which case we don't care if
  // cudaFree succeeds.
-  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+  PADDLE_ASSERT(total_alloc_size_ >= size);
+  total_alloc_size_ -= size;
+  cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p);
  if (err != cudaErrorCudartUnloading) {
    platform::throw_on_error(err, "cudaFree{Host} failed");
  }

--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -30,21 +30,24 @@ namespace detail {
 class SystemAllocator {
 public:
  virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
 };

 class CPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
 };

 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+
+ private:
+  size_t total_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU


--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
  bool freed = false;
  {
-    void* p = a.Alloc(size);
+    size_t index = 0;  // stays defined even if Alloc returns without setting it
+    void* p = a.Alloc(index, size);
    if (size > 0) {
      EXPECT_NE(p, nullptr);
    } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
    int* i = static_cast<int*>(p);
    std::shared_ptr<int> ptr(i, [&](void* p) {
      freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
    });
  }
  EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }

 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
+TEST(GPUAllocator, Alloc) {
  paddle::memory::detail::GPUAllocator a;
  TestAllocator(a, 2048);
  TestAllocator(a, 0);

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
+cc_library(cpu_info SRCS cpu_info.cc)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags)
+
+nv_library(gpu_info SRCS gpu_info.cc)
 nv_test(cuda_test SRCS cuda_test.cu)

 cc_library(place SRCS place.cc)

--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/platform/cpu_info_test.cc
+#include "paddle/platform/cpu_info.h"
+
+#include <iostream>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+// Prints the configured CPU memory fraction (as a percentage) together with
+// the total CPU memory in GB; the test makes no assertions.
+TEST(CpuMemoryUsage, Print) {
+  std::stringstream ss;
+  size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024;
+  ss << std::to_string(
+            static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use * 100))
+     << "% of CPU Memory Usage: " << mem_size << " GB";
+  // Terminate the line so the report does not run into gtest's own output.
+  std::cout << ss.str() << "\n";
+}
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_ONLY_CPU
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-namespace paddle {
-namespace platform {
-
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-int GetDeviceCount(void) {
-  int count;
-  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
-  return count;
-}
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif  // PADDLE_ONLY_CPU