Add support for the legacy allocator

e2780623 · sneaxiy · a5cf565c · e2780623 · e2780623 · e2780623
4 changed file
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
 add_subdirectory(detail)
 add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS allocator_facade)
+cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)

 cc_library(memory

--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/malloc.h"

+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
 DEFINE_bool(init_allocated_mem, false,
            "It is a mistake that the values of the memory allocated by "
            "BuddyAllocator are always zeroed in some op's implementation. "
@@ -26,17 +30,262 @@ DEFINE_bool(init_allocated_mem, false,
            "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);

+// Runtime switch read by AllocShared() and Alloc() in this file: when true
+// (the default) allocations go through the legacy BuddyAllocator code path;
+// when false they are served by allocation::AllocatorFacade.
+DEFINE_bool(use_legacy_allocator, true,
+            "Whether to use the legacy allocator. If the new allocators have "
+            "been well tested, we should remove this flag.");
+
 namespace paddle {
 namespace memory {

+namespace legacy {
+
+using BuddyAllocator = detail::BuddyAllocator;
+
+// Returns the process-wide BuddyAllocator used for CPU memory.
+//
+// The allocator is created on first use and is never deleted in this file.
+// A thread_local instance was tried for the inference::RNN1 model, but that
+// did not work well for the multi-thread test, so one shared instance is used.
+BuddyAllocator* GetCPUBuddyAllocator() {
+  // Initialization of a function-local static runs exactly once and is
+  // thread-safe since C++11.
+  static BuddyAllocator* const cpu_allocator = new BuddyAllocator(
+      std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
+      platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+
+  return cpu_allocator;
+}
+
+// Thin wrapper over malloc/free. NaiveAllocator was compared with
+// BuddyAllocator for CPU memory allocation, and the two showed almost the
+// same overhead.
+struct NaiveAllocator {
+  // Returns the single shared instance.
+  static NaiveAllocator* Instance() {
+    static NaiveAllocator singleton;
+    return &singleton;
+  }
+
+  // Allocates `size` bytes with malloc and returns malloc's result as is.
+  void* Alloc(size_t size) { return malloc(size); }
+
+  // Releases memory with free. `p` must be non-null (enforced).
+  void Free(void* p) {
+    PADDLE_ENFORCE(p);
+    free(p);
+  }
+
+ private:
+  // NOTE(review): no method visible here locks this mutex.
+  std::mutex lock_;
+};
+
+// Allocates `size` bytes of CPU memory from the shared CPU BuddyAllocator.
+//
+// When FLAGS_init_allocated_mem is set, a successful allocation is filled
+// with the byte 0xEF. The allocator's result is returned unchanged, so a null
+// result reaches the caller as null.
+template <>
+void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void* p = GetCPUBuddyAllocator()->Alloc(size);
+  // Only poison memory that was actually obtained: calling memset on a null
+  // pointer is undefined behavior.
+  if (FLAGS_init_allocated_mem && p != nullptr) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+
+// Hands `p` back to the shared CPU BuddyAllocator.
+template <>
+void Free<platform::CPUPlace>(const platform::CPUPlace& place, void* p) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  BuddyAllocator* cpu_allocator = GetCPUBuddyAllocator();
+  cpu_allocator->Free(p);
+}
+
+// Reports the value of Used() on the shared CPU BuddyAllocator.
+template <>
+size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
+  BuddyAllocator* cpu_allocator = GetCPUBuddyAllocator();
+  return cpu_allocator->Used();
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+// Returns the BuddyAllocator for GPU `gpu_id` and makes that GPU the current
+// device.
+//
+// One allocator per device reported by platform::GetCUDADeviceCount() is
+// created on the first call; the table is never deleted in this file.
+// `gpu_id` is validated on every call, because an out-of-range id would index
+// past the end of the allocator table.
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator** a_arr = nullptr;
+  static int gpu_num = 0;
+
+  std::call_once(init_flag, []() {
+    const int device_count = platform::GetCUDADeviceCount();
+
+    a_arr = new BuddyAllocator*[device_count]();
+    for (int i = 0; i < device_count; i++) {
+      // NOTE(review): the device is switched before building each allocator,
+      // presumably because the chunk sizes depend on the current device --
+      // confirm.
+      platform::SetDeviceId(i);
+      a_arr[i] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+      VLOG(10) << "\n\nNOTE: each GPU device use "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
+    }
+    // Recorded last, after every table entry has been filled in.
+    gpu_num = device_count;
+  });
+
+  // Checked outside call_once so that every call is validated, not only the
+  // one that happened to trigger initialization.
+  PADDLE_ENFORCE(gpu_id >= 0, "gpu_id:%d should be >= 0", gpu_id);
+  PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
+                 gpu_num);
+
+  platform::SetDeviceId(gpu_id);
+  return a_arr[gpu_id];
+}
+
+// Reports the value of Used() on the BuddyAllocator of GPU `place.device`.
+template <>
+size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+  BuddyAllocator* gpu_allocator = GetGPUBuddyAllocator(place.device);
+  return gpu_allocator->Used();
+}
+
+// Allocates `size` bytes on GPU `place.device` from that device's
+// BuddyAllocator.
+//
+// On failure the memory situation is logged as warnings and nullptr is
+// returned. On success, when FLAGS_init_allocated_mem is set, the memory is
+// filled with the byte 0xEF.
+template <>
+void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
+                                 size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    // NOTE(review): GetGPUBuddyAllocator() has already switched the current
+    // device to place.device, so cur_dev is likely place.device here --
+    // confirm whether restoring the caller's device was intended.
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail = 0;
+    size_t total = 0;
+    platform::GpuMemoryUsage(&avail, &total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+    // Return before the fill step: cudaMemset must not be given a null
+    // device pointer.
+    return nullptr;
+  }
+  if (FLAGS_init_allocated_mem) {
+    cudaMemset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+// Hands `p` back to the BuddyAllocator of GPU `place.device`.
+template <>
+void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+  BuddyAllocator* gpu_allocator = GetGPUBuddyAllocator(place.device);
+  gpu_allocator->Free(p);
+}
+
+// Returns the process-wide BuddyAllocator backed by
+// detail::CUDAPinnedAllocator. It is built on the first call and is never
+// deleted in this file.
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static std::once_flag pinned_once;
+  static BuddyAllocator* pinned_allocator = nullptr;
+
+  // Runs at most once, under std::call_once below.
+  auto build_allocator = []() {
+    pinned_allocator =
+        new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                               new detail::CUDAPinnedAllocator),
+                           platform::CUDAPinnedMinChunkSize(),
+                           platform::CUDAPinnedMaxChunkSize());
+  };
+  std::call_once(pinned_once, build_allocator);
+
+  return pinned_allocator;
+}
+
+// Reports the value of Used() on the CUDA pinned-memory BuddyAllocator.
+template <>
+size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+  BuddyAllocator* pinned_allocator = GetCUDAPinnedBuddyAllocator();
+  return pinned_allocator->Used();
+}
+
+// Allocates `size` bytes from the CUDA pinned-memory BuddyAllocator.
+//
+// On failure a warning is logged and nullptr is returned. On success, when
+// FLAGS_init_allocated_mem is set, the memory is filled with the byte 0xEF.
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
+  void* ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+                 << " bytes in CUDAPinnedPlace";
+    // Return before the fill step: memset on a null pointer is undefined
+    // behavior.
+    return nullptr;
+  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+// Hands `p` back to the CUDA pinned-memory BuddyAllocator.
+template <>
+void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
+                                     void* p) {
+  BuddyAllocator* pinned_allocator = GetCUDAPinnedBuddyAllocator();
+  pinned_allocator->Free(p);
+}
+#endif
+
+// boost::static_visitor that allocates a fixed number of bytes on whichever
+// place type the visited variant holds, by calling legacy::Alloc<Place>.
+//
+// NOTE(review): the CUDAPlace / CUDAPinnedPlace specializations of Alloc are
+// only defined under PADDLE_WITH_CUDA in this file -- confirm that CPU-only
+// builds still link when this visitor is applied to platform::Place.
+struct AllocVisitor : boost::static_visitor<void*> {
+  explicit AllocVisitor(size_t size) : size_(size) {}
+
+  // Dispatches to the Alloc specialization matching `place`'s type.
+  template <typename PlaceType>
+  void* operator()(const PlaceType& place) const {
+    return Alloc<PlaceType>(place, size_);
+  }
+
+ private:
+  size_t size_;  // number of bytes to request
+};
+
+// boost::static_visitor that releases a pointer on whichever place type the
+// visited variant holds, by calling legacy::Free<Place>.
+//
+// NOTE(review): the CUDAPlace / CUDAPinnedPlace specializations of Free are
+// only defined under PADDLE_WITH_CUDA in this file -- confirm that CPU-only
+// builds still link when this visitor is applied to platform::Place.
+struct FreeVisitor : boost::static_visitor<void> {
+  explicit FreeVisitor(void* ptr) : ptr_(ptr) {}
+
+  // Dispatches to the Free specialization matching `place`'s type.
+  template <typename PlaceType>
+  void operator()(const PlaceType& place) const {
+    Free<PlaceType>(place, ptr_);
+  }
+
+ private:
+  void* ptr_;  // pointer to release
+};
+
+// Usage visitor, CPU case: forwards to Used<platform::CPUPlace>.
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+  return Used<platform::CPUPlace>(cpu);
+}
+
+// Usage visitor, GPU case: forwards to Used<platform::CUDAPlace> when built
+// with PADDLE_WITH_CUDA, and throws otherwise.
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifndef PADDLE_WITH_CUDA
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#else
+  return Used<platform::CUDAPlace>(gpu);
+#endif
+}
+
+// Usage visitor, pinned-memory case: forwards to
+// Used<platform::CUDAPinnedPlace> when built with PADDLE_WITH_CUDA, and throws
+// otherwise.
+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+#ifndef PADDLE_WITH_CUDA
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#else
+  return Used<platform::CUDAPinnedPlace>(cuda_pinned);
+#endif
+}
+
+// Returns the legacy allocator's Used() figure for the place held by `p`, by
+// applying the Usage visitor to it.
+size_t memory_usage(const platform::Place& p) {
+  Usage usage_visitor;
+  return boost::apply_visitor(usage_visitor, p);
+}
+
+// Allocation whose memory came from one of the legacy::Alloc specializations.
+// On destruction the memory is returned through legacy::Free for the place
+// the allocation was made on.
+class LegacyAllocation : public Allocation {
+ public:
+  using Allocation::Allocation;
+
+  ~LegacyAllocation() {
+    // The legacy Alloc paths can yield nullptr (the GPU path logs a warning
+    // and returns null), and the callers wrap that pointer without checking
+    // it, so there may be nothing to release.
+    void* held = this->ptr();
+    if (held != nullptr) {
+      boost::apply_visitor(FreeVisitor(held), this->place());
+    }
+  }
+};
+
+}  // namespace legacy
+
 std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                        size_t size, Allocator::Attr attr) {
-  return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr);
+  // New allocator path: delegate to AllocatorFacade, forwarding `attr`.
+  if (!FLAGS_use_legacy_allocator) {
+    return allocation::AllocatorFacade::Instance().AllocShared(place, size,
+                                                               attr);
+  }
+
+  // Legacy path: allocate through the per-place legacy::Alloc and wrap the
+  // result so that it is freed through legacy::Free. `attr` is not used here.
+  void* raw = boost::apply_visitor(legacy::AllocVisitor(size), place);
+  return std::shared_ptr<Allocation>(
+      new legacy::LegacyAllocation(raw, size, place));
 }

 std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                  Allocator::Attr attr) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
+  // New allocator path: delegate to AllocatorFacade, forwarding `attr`.
+  if (!FLAGS_use_legacy_allocator) {
+    return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
+  }
+
+  // Legacy path: allocate through the per-place legacy::Alloc and wrap the
+  // result so that it is freed through legacy::Free. `attr` is not used here.
+  void* raw = boost::apply_visitor(legacy::AllocVisitor(size), place);
+  return std::unique_ptr<Allocation>(
+      new legacy::LegacyAllocation(raw, size, place));
 }
+
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -30,5 +30,26 @@ extern std::unique_ptr<Allocation> Alloc(
    const platform::Place& place, size_t size,
    Allocator::Attr attr = Allocator::kDefault);

+namespace legacy {
+
+// Allocates `size` bytes on `place`. Only declared here; the definitions are
+// explicit specializations per place type.
+template <typename PlaceType>
+void* Alloc(const PlaceType& place, size_t size);
+
+// Releases `p`, which was allocated on `place`. Only declared here; the
+// definitions are explicit specializations per place type.
+template <typename PlaceType>
+void Free(const PlaceType& place, void* p);
+
+// Returns the amount of memory in use on `place`. Only declared here; the
+// definitions are explicit specializations per place type.
+template <typename PlaceType>
+size_t Used(const PlaceType& place);
+
+// Visitor over platform::Place that yields a size_t for each place type.
+struct Usage : boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
+};
+
+// Returns the memory usage figure for the place held by `p`.
+size_t memory_usage(const platform::Place& p);
+
+}  // namespace legacy
+
 }  // namespace memory
 }  // namespace paddle
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -113,7 +113,7 @@ def __bootstrap__():
        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb'
+        'eager_delete_tensor_gb', 'use_legacy_allocator'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')