Unverified commit 28375ca4, authored by Ruibiao Chen, committed by GitHub

Print memory peak message for UT (#42092)

* Add peak memory log for CI

* Change VLOG to std::cout

* Move print code to test_runner.py and paddle_gtest_main.cc

* Fix typo

* Fix conflicts

* Update message format

* Fix CI errors

* Add FLAGS_enable_gpu_memory_usage_log

* Fix CI errors
Parent e8e3b997
@@ -107,7 +107,7 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
       break
 #define MEMORY_STAT_FUNC(item, id, func, ...)         \
-  do {                                                \
+  [&] {                                               \
     paddle::memory::StatBase* stat = nullptr;         \
     switch (id) {                                     \
       MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);         \
@@ -133,8 +133,8 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
               id));                                   \
         break;                                        \
     }                                                 \
-    stat->func(__VA_ARGS__);                          \
-  } while (0)
+    return stat->func(__VA_ARGS__);                   \
+  }()

 #define MEMORY_STAT_CURRENT_VALUE(item, id) \
   MEMORY_STAT_FUNC(item, id, GetCurrentValue)
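The macro change above is purely structural: a `do { ... } while (0)` block is a statement and cannot yield a value, whereas an immediately-invoked lambda is an expression, so MEMORY_STAT_CURRENT_VALUE and MEMORY_STAT_PEAK_VALUE can now return the value of the stat they look up. A minimal standalone sketch of the pattern (toy names, not Paddle code):

#include <cstdint>
#include <iostream>

struct Stat {
  int64_t GetPeakValue() const { return 42; }
};

// Old style: a statement-macro; cannot appear on the right of '='.
#define CALL_STAT_STMT(s, func) \
  do {                          \
    (s).func();                 \
  } while (0)

// New style: an immediately-invoked lambda; forwards the return value.
#define CALL_STAT_EXPR(s, func) \
  [&] { return (s).func(); }()

int main() {
  Stat s;
  int64_t peak = CALL_STAT_EXPR(s, GetPeakValue);  // usable as an expression
  std::cout << peak << std::endl;                  // prints 42
}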
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
@@ -49,6 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_uint64(gpu_memory_limit_mb);

+#ifdef PADDLE_WITH_TESTING
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
+                            "Whether to print the message of gpu memory usage "
+                            "at exit, mainly used for UT and CI.");
+#endif
+
 constexpr static float fraction_reserve_gpu_memory = 0.05f;

 USE_GPU_MEM_STAT;
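PADDLE_DEFINE_EXPORTED_bool is Paddle's wrapper around a gflags-style flag definition that, as the name suggests, also exports the symbol; guarding it with PADDLE_WITH_TESTING keeps the flag out of non-test builds. A hedged sketch of the equivalent using plain gflags (standard gflags API assumed, not Paddle's wrapper):

#include <gflags/gflags.h>

#ifdef PADDLE_WITH_TESTING
// Readable in code as FLAGS_enable_gpu_memory_usage_log and settable on the
// command line as --enable_gpu_memory_usage_log.
DEFINE_bool(enable_gpu_memory_usage_log, false,
            "Whether to print the message of gpu memory usage "
            "at exit, mainly used for UT and CI.");
#endif

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  return 0;
}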
@@ -137,12 +144,31 @@ class RecordedGpuMallocHelper {
     if (NeedRecord()) {
       mtx_.reset(new std::mutex());
     }
+
+#ifdef PADDLE_WITH_TESTING
+    if (FLAGS_enable_gpu_memory_usage_log) {
+      // A fake UPDATE to trigger the construction of memory stat instances,
+      // make sure that they are destructed after RecordedGpuMallocHelper.
+      MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
+    }
+#endif
   }

   DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper);

 public:
+  ~RecordedGpuMallocHelper() {
+#ifdef PADDLE_WITH_TESTING
+    if (FLAGS_enable_gpu_memory_usage_log) {
+      std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : "
+                << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl;
+    }
+#endif
+  }
+
   static RecordedGpuMallocHelper *Instance(int dev_id) {
+    static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;
     std::call_once(once_flag_, [] {
       int dev_cnt = GetGPUDeviceCount();
       instances_.reserve(dev_cnt);
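The "fake UPDATE" in the constructor exploits the C++ rule that function-local statics are destroyed in reverse order of construction completion: touching the stat singleton before the helper finishes constructing guarantees the stat outlives the helper, so the destructor above can still read the peak value at process exit. Moving instances_ into Instance() (see the removal in the next hunk) gives the helpers the same function-local-static lifetime. A standalone sketch of the ordering trick (toy types, not Paddle code):

#include <iostream>

struct Stat {
  long peak = 0;
  ~Stat() { std::cout << "Stat destroyed last\n"; }
};

Stat& GetStat() {
  static Stat s;  // constructed on first use
  return s;
}

struct Helper {
  Helper() { GetStat(); }  // "fake" touch: force Stat to exist first
  ~Helper() {              // runs before Stat's destructor
    std::cout << "peak = " << GetStat().peak << "\n";
  }
};

Helper& GetHelper() {
  static Helper h;  // finishes constructing after Stat, so destroyed first
  return h;
}

int main() { GetHelper(); }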
@@ -326,14 +352,11 @@ class RecordedGpuMallocHelper {
   mutable std::unique_ptr<std::mutex> mtx_;

   static std::once_flag once_flag_;
-  static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;

   std::set<void *> gpu_ptrs;  // just for testing
 };  // NOLINT

 std::once_flag RecordedGpuMallocHelper::once_flag_;
-std::vector<std::unique_ptr<RecordedGpuMallocHelper>>
-    RecordedGpuMallocHelper::instances_;

 gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
                              bool malloc_managed_memory) {
@@ -106,9 +106,6 @@ namespace phi {
 class ErrorSummary;
 }  // namespace phi

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
 DECLARE_int32(call_stack_level);

 namespace paddle {
@@ -539,7 +536,7 @@ inline void retry_sleep(unsigned milliseconds) {
         ::paddle::platform::details::ExternalApiType<                   \
             __CUDA_STATUS_TYPE__>::kSuccess;                            \
     while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
-      paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);    \
+      paddle::platform::retry_sleep(10000);                             \
       __cond__ = (COND);                                                \
       ++retry_count;                                                    \
     }                                                                   \
@@ -727,7 +724,7 @@ inline void retry_sleep(unsigned millisecond) {
         ::paddle::platform::details::ExternalApiType<                   \
             __CUDA_STATUS_TYPE__>::kSuccess;                            \
     while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
-      ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);  \
+      ::paddle::platform::retry_sleep(10000);                           \
       __cond__ = (COND);                                                \
       ++retry_count;                                                    \
     }                                                                   \
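Hard-coding retry_sleep(10000) removes enforce.h's dependency on the allocator flag (its DECLARE was deleted above); the retry interval becomes a fixed 10 000 ms instead of FLAGS_gpu_allocator_retry_time. A minimal sketch of the bounded-retry pattern these macros expand to (toy helper, not the actual macro expansion):

#include <chrono>
#include <iostream>
#include <thread>

// Retry a failing call up to max_retries times, sleeping a fixed
// interval_ms between attempts, mirroring the retry loop in the macros.
template <typename F>
bool RetryWithSleep(F&& call, int max_retries = 5,
                    unsigned interval_ms = 10000) {
  bool ok = call();
  int retry_count = 0;
  while (!ok && retry_count < max_retries) {
    std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms));
    ok = call();
    ++retry_count;
  }
  return ok;
}

int main() {
  int attempts = 0;
  // Succeeds on the third try; interval shortened so the demo runs quickly.
  bool ok = RetryWithSleep([&] { return ++attempts >= 3; },
                           /*max_retries=*/5, /*interval_ms=*/1);
  std::cout << std::boolalpha << ok << " after " << attempts
            << " attempts\n";  // true after 3 attempts
}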
 # for paddle test case
 if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils)
+  set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc)
+  if (WITH_GPU OR WITH_ROCM)
+    list(APPEND paddle_gtest_main_deps gpu_info)
+  endif()
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps})
 endif()
@@ -20,6 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/init.h"

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DECLARE_bool(enable_gpu_memory_usage_log);
+#endif
+
 int main(int argc, char** argv) {
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
   testing::InitGoogleTest(&argc, argv);
@@ -81,6 +85,13 @@ int main(int argc, char** argv) {
     VLOG(1) << "gtest undefok_string:" << undefok_string;
   }

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (strstr(undefok_str, "enable_gpu_memory_usage_log")) {
+    VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true";
+    FLAGS_enable_gpu_memory_usage_log = true;
+  }
+#endif
+
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   ::GFLAGS_NAMESPACE::ParseCommandLineFlags(
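paddle_gtest_main.cc flips the flag itself before ParseCommandLineFlags runs, presumably because the flag only exists in testing builds and is therefore routed through the --undefok list rather than parsed normally; a plain strstr on the undefok string is enough to detect the request. A reduced sketch of that detection (variable names assumed, not Paddle code):

#include <cstring>
#include <iostream>

static bool FLAGS_enable_gpu_memory_usage_log = false;

// Scan a comma-separated --undefok value for the flag name and enable the
// usage log before the flag parser would otherwise discard the request.
void MaybeEnableUsageLog(const char* undefok_str) {
  if (std::strstr(undefok_str, "enable_gpu_memory_usage_log")) {
    FLAGS_enable_gpu_memory_usage_log = true;
  }
}

int main() {
  MaybeEnableUsageLog("allocator_strategy,enable_gpu_memory_usage_log");
  std::cout << std::boolalpha << FLAGS_enable_gpu_memory_usage_log
            << std::endl;  // true
}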
@@ -20,6 +20,7 @@ import sys
 import paddle
 import paddle.fluid as fluid
 import importlib
+import paddle.fluid.core as core
 from six.moves import cStringIO

 sys.path.append(os.path.abspath(os.path.dirname(__file__)))
@@ -28,6 +29,10 @@ import static_mode_white_list

 def main():
     sys.path.append(os.getcwd())
+    if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
+        if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None):
+            os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
+
     some_test_failed = False
     for module_name in sys.argv[1:]:
         flag_need_static_mode = False
@@ -45,6 +50,7 @@ def main():
         module = importlib.import_module(module_name)
         tests = test_loader.loadTestsFromModule(module)
         res = unittest.TextTestRunner(stream=buffer).run(tests)
+
         if not res.wasSuccessful():
             some_test_failed = True
             print(