Merge pull request #3534 from gangliao/mem_release

FIX: Release CPU/GPU memory at the end of the Program

Merge pull request #3534 from gangliao/mem_release
FIX: Release CPU/GPU memory at the end of the Program
812a64c0 · gangliao · GitHub · 47f380bb · 94b58a29 · 812a64c0
4 changed files
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License. */
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
 namespace paddle {
 namespace memory {

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -13,22 +13,32 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/memory/memory.h"
-#include "paddle/memory/detail/buddy_allocator.h"
-#include "paddle/memory/detail/system_allocator.h"
+#include <algorithm>  // for transform
 #include <cstring>    // for memcpy
+#include <mutex>      // for call_once
+#include "paddle/memory/detail/buddy_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
 namespace paddle {
 namespace memory {
-detail::BuddyAllocator* GetCPUBuddyAllocator() {
+using BuddyAllocator = detail::BuddyAllocator;
-  static detail::BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
+std::once_flag cpu_allocator_flag;
-    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+std::once_flag gpu_allocator_flag;
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static std::unique_ptr<BuddyAllocator> a{nullptr};
+  std::call_once(cpu_allocator_flag, [&]() {
+    a.reset(new BuddyAllocator(new detail::CPUAllocator,
                               platform::CpuMinChunkSize(),
-                                   platform::CpuMaxChunkSize());
+                               platform::CpuMaxChunkSize()));
-  }
+  });
-  return a;
+  return a.get();
 }
 template <>
@@ -48,20 +58,31 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
 #ifndef PADDLE_ONLY_CPU
-detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static detail::BuddyAllocator** as = NULL;
+  using BuddyAllocVec = std::vector<BuddyAllocator*>;
-  if (as == NULL) {
+  static std::unique_ptr<BuddyAllocVec, void (*)(BuddyAllocVec * p)> as{
+      new BuddyAllocVec, [](BuddyAllocVec* p) {
+        std::for_each(p->begin(), p->end(),
+                      [](BuddyAllocator* p) { delete p; });
+      }};
+  // GPU buddy allocators
+  auto& allocators = *as.get();
+  // GPU buddy allocator initialization
+  std::call_once(gpu_allocator_flag, [&]() {
    int gpu_num = platform::GetDeviceCount();
-    as = new detail::BuddyAllocator*[gpu_num];
+    allocators.reserve(gpu_num);
    for (int gpu = 0; gpu < gpu_num; gpu++) {
      platform::SetDeviceId(gpu);
-      as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
+      allocators.emplace_back(new BuddyAllocator(new detail::GPUAllocator,
                                                 platform::GpuMinChunkSize(),
-                                           platform::GpuMaxChunkSize());
+                                                 platform::GpuMaxChunkSize()));
-    }
    }
+  });
  platform::SetDeviceId(gpu_id);
-  return as[gpu_id];
+  return allocators[gpu_id];
 }
 template <>

--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -45,4 +45,8 @@ TEST(Gather, GatherData) {
  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
+  delete src;
+  delete index;
+  delete output;
 }
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -49,4 +49,8 @@ TEST(scatter, ScatterUpdate) {
    EXPECT_EQ(output->data<float>()[i], float(i - 4));
  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+  delete src;
+  delete index;
+  delete output;
 }