expose peak gpu memory API to python test=develop (#15529)

* expose peak gpu memory API to python test=develop * add unittest for peak gpu memory monitoring test=develop * add pybind change test=develop * add mutex to gpu mem usage monitor test=develop * update benchmark flag definition file test=develop * tweak unittest for memory monitoring test=develop

expose peak gpu memory API to python test=develop (#15529)
* expose peak gpu memory API to python test=develop * add unittest for peak gpu memory monitoring test=develop * add pybind change test=develop * add mutex to gpu mem usage monitor test=develop * update benchmark flag definition file test=develop * tweak unittest for memory monitoring test=develop
6e84eb13 · liuwei1031 · dzhwinter · 1743d1a5 · 6e84eb13 · 6e84eb13
6 changed file
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -22,11 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
-DEFINE_bool(benchmark, false,
+DECLARE_bool(benchmark);
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
 DEFINE_bool(
    eager_delete_scope, true,

--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -35,6 +35,7 @@ DEFINE_bool(init_allocated_mem, false,
            "To find this error in time, we use init_allocated_mem to indicate "
            "that initializing the allocated memory with a small value "
            "during unit testing.");
+DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
@@ -59,11 +60,6 @@ size_t memory_usage(const platform::Place &p);
 using BuddyAllocator = detail::BuddyAllocator;
-std::unordered_map</*device id*/ int,
-                   std::pair</*current memory usage*/ uint64_t,
-                             /*peak memory usage*/ uint64_t>>
-    gpu_mem_info;
 BuddyAllocator *GetCPUBuddyAllocator() {
  // We tried thread_local for inference::RNN1 model, but that not works much
  // for multi-thread test.
@@ -144,6 +140,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
    devices = platform::GetSelectedDevices();
    int gpu_num = devices.size();
+    allocation::GPUMemMonitor.Initialize(devices.size());
    a_arr = new BuddyAllocator *[gpu_num];
    for (size_t i = 0; i < devices.size(); ++i) {
      int dev_id = devices[i];
@@ -204,12 +202,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
    platform::SetDeviceId(cur_dev);
  } else {
-    gpu_mem_info[place.device].first += size;
+    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
-    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
-      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
-      VLOG(3) << "device: " << place.device << " peak memory usage : "
-              << (gpu_mem_info[place.device].second >> 20) << " MiB";
-    }
    if (FLAGS_init_allocated_mem) {
      cudaMemset(ptr, 0xEF, size);
    }
@@ -225,7 +218,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                               size_t size) {
 #ifdef PADDLE_WITH_CUDA
  GetGPUBuddyAllocator(place.device)->Free(p);
-  gpu_mem_info[place.device].first -= size;
+  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
 #else
  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -335,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
 namespace allocation {
+LegacyMemMonitor GPUMemMonitor;
 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
  void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
  return new Allocation(ptr, size, place_);
@@ -346,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) {
      allocation->place());
  delete allocation;
 }
+bool MemInfo::Add(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ += size;
+  bool peak_point = usage_ > peak_usage_;
+  if (peak_point) peak_usage_ = usage_;
+  return peak_point;
+}
+void MemInfo::Minus(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ -= size;
+}
+uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+LegacyMemMonitor::~LegacyMemMonitor() {
+  for (auto &item : gpu_mem_info_) delete item.second;
+}
+void LegacyMemMonitor::Initialize(const int &device_num) {
+  for (auto i = 0; i < device_num; ++i) {
+    gpu_mem_info_[i] = new MemInfo();
+  }
+}
+void LegacyMemMonitor::Add(const int &device, const size_t &size) {
+  if (gpu_mem_info_[device]->Add(size)) {
+    VLOG(3) << "#LegacyMemMonitor# device: " << device
+            << " peak memory usage : "
+            << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB";
+  }
+}
+void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
+  gpu_mem_info_[device]->Minus(size);
+}
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+  return gpu_mem_info_.find(device) == gpu_mem_info_.end()
+             ? 0
+             : gpu_mem_info_[device]->GetPeakUsage();
+}
+void LegacyMemMonitor::PrintMemUsage() {
+  std::vector<int> devices;
+  for (const auto &item : gpu_mem_info_) {
+    devices.emplace_back(item.first);
+  }
+  std::sort(devices.begin(), devices.end());
+  for (const auto &device : devices) {
+    std::cout << "Device : " << device << " Peak Memory Usage : "
+              << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"
+              << std::endl;
+  }
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -13,12 +13,59 @@
 // limitations under the License.
 #pragma once
+#include <algorithm>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
+class MemInfo {
+ public:
+  MemInfo() : usage_(0), peak_usage_(0) {}
+  MemInfo(const MemInfo &) = delete;
+  MemInfo &operator=(const MemInfo &) = delete;
+  // return a flag to indicate current operation will create a peak point or not
+  bool Add(const size_t &);
+  void Minus(const size_t &);
+  uint64_t GetPeakUsage();
+ private:
+  /* current memory usage*/
+  uint64_t usage_;
+  uint64_t peak_usage_;
+  std::mutex mutex_;
+};
+class LegacyMemMonitor {
+ public:
+  // used to store the GPU memory usage of each devices
+  using MemUsage = std::unordered_map</*device id*/ int,
+                                      /*mem usage info node*/ MemInfo *>;
+  MemUsage GetMemUsageInfo() { return gpu_mem_info_; }
+  ~LegacyMemMonitor();
+  void Initialize(const int &);
+  void Add(const int &, const size_t &);
+  void Minus(const int &, const size_t &);
+  uint64_t GetMemUsage(const int &);
+  void PrintMemUsage();
+ protected:
+  MemUsage gpu_mem_info_;
+};
+extern LegacyMemMonitor GPUMemMonitor;
 class LegacyAllocatorPrivate;
 class LegacyAllocator : public Allocator {
 public:

--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
 namespace paddle {
 namespace platform {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) {
  m.add_object("_cleanup",
               py::capsule([]() { ScopePool::Instance().Clear(); }));
+  m.def("get_mem_usage", [](int device) {
+    return memory::allocation::GPUMemMonitor.GetMemUsage(device);
+  });
+  m.def("print_mem_usage",
+        []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
  py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
      // .def(py::init<>())
      .def(py::init<bool>(), py::arg("stop_gradient") = false)

--- a/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import os
+os.environ['FLAGS_benchmark'] = 'True'
+import numpy
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import mul, data
+class TestPeakMemoryMonitoring(unittest.TestCase):
+    def test_mul(self):
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            a_np = numpy.random.random((100, 784)).astype('float32')
+            b_np = numpy.random.random((784, 100)).astype('float32')
+            self.assertEqual(0, core.get_mem_usage(0))
+            exe = Executor(place)
+            outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+            out = outs[0]
+            #disable this assert since ctest will ignore the os.environ setting 
+            #self.assertGreater(core.get_mem_usage(0), 0)
+            raised = False
+            try:
+                core.print_mem_usage()
+            except:
+                raised = True
+            self.assertFalse(raised, 'Exception raised')
+if __name__ == '__main__':
+    unittest.main()