From 8164414578a7c1952bf7ee96c2a70b1932c39ca5 Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Tue, 10 May 2022 14:52:35 +0800
Subject: [PATCH] [Eager] print gpu mem info (#42616)

* print mem

* refine

* refine

* refine

* refine
---
 paddle/fluid/platform/device/gpu/gpu_info.cc | 24 +++++++++++++-------
 tools/get_ut_mem_map.py                      |  4 ++--
 tools/test_runner.py                         |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index eb82389702c..6da5d1244fb 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -50,11 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_uint64(gpu_memory_limit_mb);
 
-#ifdef PADDLE_WITH_TESTING
 PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
                             "Whether to print the message of gpu memory usage "
                             "at exit, mainly used for UT and CI.");
-#endif
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true,
+                            "Whether the gpu memory usage message printed at "
+                            "exit uses MB (true) or Byte (false) as unit.");
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
@@ -145,25 +146,32 @@ class RecordedGpuMallocHelper {
       mtx_.reset(new std::mutex());
     }
 
-#ifdef PADDLE_WITH_TESTING
     if (FLAGS_enable_gpu_memory_usage_log) {
       // A fake UPDATE to trigger the construction of memory stat instances,
       // make sure that they are destructed after RecordedGpuMallocHelper.
       MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
+      MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
     }
-#endif
   }
 
   DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper);
 
  public:
   ~RecordedGpuMallocHelper() {
-#ifdef PADDLE_WITH_TESTING
     if (FLAGS_enable_gpu_memory_usage_log) {
-      std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : "
-                << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl;
+      if (FLAGS_enable_gpu_memory_usage_log_mb) {  // peaks in MB (bytes / 2^20)
+        std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = "
+                  << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0
+                  << ", Allocated = "
+                  << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0
+                  << std::endl;  // endl also flushes stdout
+      } else {  // raw peak values, labelled Byte
+        std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = "
+                  << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
+                  << ", Allocated = "
+                  << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl;
+      }
     }
-#endif
   }
 
   static RecordedGpuMallocHelper *Instance(int dev_id) {
diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py
index daf80597d3a..745d7f9a90c 100644
--- a/tools/get_ut_mem_map.py
+++ b/tools/get_ut_mem_map.py
@@ -34,8 +34,8 @@ def get_ut_mem(rootPath):
                 if '[Memory Usage (Byte)] gpu' in line:
                     mem_reserved = round(
                         float(
-                            line.split('[max memory reserved] gpu')[1].split(
-                                ':')[1].split('\\n')[0].strip()), 2)
+                            line.split(' : Reserved = ')[1].split(
+                                ', Allocated = ')[0]), 2)
                     if mem_reserved > mem_reserved1:
                         mem_reserved1 = mem_reserved
                 if 'MAX_GPU_MEMORY_USE=' in line:
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 7ceed18634a..02d926914f9 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -32,6 +32,7 @@ def main():
     if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
         if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None):
             os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
+            os.environ['FLAGS_enable_gpu_memory_usage_log_mb'] = 'false'
 
     some_test_failed = False
     for module_name in sys.argv[1:]:
-- 
GitLab