From 28375ca4625067ebd72b39c6b8913127268a3a42 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Thu, 5 May 2022 14:41:36 +0800 Subject: [PATCH] Print memory peak message for UT (#42092) * Add peak memory log for CI * Change VLOG to std::cout * Move print code to test_runner.py and paddle_gtest_main.cc * Fix typo * Fix conflicts * Update message format * Fix CI errors * Add FLAGS_enable_gpu_memory_usage_log * Fix CI errors --- paddle/fluid/memory/stats.h | 6 ++-- paddle/fluid/platform/device/gpu/gpu_info.cc | 29 ++++++++++++++++++-- paddle/fluid/platform/enforce.h | 7 ++--- paddle/testing/CMakeLists.txt | 8 +++++- paddle/testing/paddle_gtest_main.cc | 11 ++++++++ tools/test_runner.py | 6 ++++ 6 files changed, 55 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index f644d2f587..0906567dbf 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -107,7 +107,7 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); break #define MEMORY_STAT_FUNC(item, id, func, ...) \ - do { \ + [&] { \ paddle::memory::StatBase* stat = nullptr; \ switch (id) { \ MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ @@ -133,8 +133,8 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); id)); \ break; \ } \ - stat->func(__VA_ARGS__); \ - } while (0) + return stat->func(__VA_ARGS__); \ + }() #define MEMORY_STAT_CURRENT_VALUE(item, id) \ MEMORY_STAT_FUNC(item, id, GetCurrentValue) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 89e3b74bb3..eb82389702 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" @@ -49,6 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); +#ifdef PADDLE_WITH_TESTING +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, + "Whether to print the message of gpu memory usage " + "at exit, mainly used for UT and CI."); +#endif + constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; @@ -137,12 +144,31 @@ class RecordedGpuMallocHelper { if (NeedRecord()) { mtx_.reset(new std::mutex()); } + +#ifdef PADDLE_WITH_TESTING + if (FLAGS_enable_gpu_memory_usage_log) { + // A fake UPDATE to trigger the construction of memory stat instances, + // make sure that they are destructed after RecordedGpuMallocHelper. 
+ MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + } +#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: + ~RecordedGpuMallocHelper() { +#ifdef PADDLE_WITH_TESTING + if (FLAGS_enable_gpu_memory_usage_log) { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + } +#endif + } + static RecordedGpuMallocHelper *Instance(int dev_id) { + static std::vector> instances_; + std::call_once(once_flag_, [] { int dev_cnt = GetGPUDeviceCount(); instances_.reserve(dev_cnt); @@ -326,14 +352,11 @@ class RecordedGpuMallocHelper { mutable std::unique_ptr mtx_; static std::once_flag once_flag_; - static std::vector> instances_; std::set gpu_ptrs; // just for testing }; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; -std::vector> - RecordedGpuMallocHelper::instances_; gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory) { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c7a6bdc3ce..772a7750fe 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -106,9 +106,6 @@ namespace phi { class ErrorSummary; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_int64(gpu_allocator_retry_time); -#endif DECLARE_int32(call_stack_level); namespace paddle { @@ -539,7 +536,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +724,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - 
::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2c977e923b..f5cfd14e6b 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,11 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils) + set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc) + + if (WITH_GPU OR WITH_ROCM) + list(APPEND paddle_gtest_main_deps gpu_info) + endif() + + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps}) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index bb919f0e91..16c683e39f 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -20,6 +20,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(enable_gpu_memory_usage_log); +#endif + int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); @@ -81,6 +85,13 @@ int main(int argc, char** argv) { VLOG(1) << "gtest undefok_string:" << undefok_string; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { + VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; + FLAGS_enable_gpu_memory_usage_log = true; + } +#endif + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( diff --git a/tools/test_runner.py b/tools/test_runner.py index 2d0c9c4a13..7ceed18634 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid import importlib +import paddle.fluid.core as core from six.moves import cStringIO sys.path.append(os.path.abspath(os.path.dirname(__file__))) @@ -28,6 +29,10 @@ import static_mode_white_list def main(): sys.path.append(os.getcwd()) + if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): + if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None): + os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' + some_test_failed = False for module_name in sys.argv[1:]: flag_need_static_mode = False @@ -45,6 +50,7 @@ def main(): module = importlib.import_module(module_name) tests = test_loader.loadTestsFromModule(module) res = unittest.TextTestRunner(stream=buffer).run(tests) + if not res.wasSuccessful(): some_test_failed = True print( -- GitLab