Unverified commit 28375ca4, authored by Ruibiao Chen, committed by GitHub

Print memory peak message for UT (#42092)

* Add peak memory log for CI

* Change VLOG to std::cout

* Move print code to test_runner.py and paddle_gtest_main.cc

* Fix typo

* Fix conflicts

* Update message format

* Fix CI errors

* Add FLAGS_enable_gpu_memory_usage_log

* Fix CI errors
Parent e8e3b997
@@ -107,7 +107,7 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
       break
 #define MEMORY_STAT_FUNC(item, id, func, ...)         \
-  do {                                                \
+  [&] {                                               \
     paddle::memory::StatBase* stat = nullptr;         \
     switch (id) {                                     \
       MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);         \
@@ -133,8 +133,8 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
               id));                                   \
         break;                                        \
     }                                                 \
-    stat->func(__VA_ARGS__);                          \
-  } while (0)
+    return stat->func(__VA_ARGS__);                   \
+  }()

 #define MEMORY_STAT_CURRENT_VALUE(item, id) \
   MEMORY_STAT_FUNC(item, id, GetCurrentValue)
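The macro change above is purely structural: a `do { ... } while (0)` block is a statement and cannot yield a value, whereas an immediately-invoked lambda is an expression, so MEMORY_STAT_CURRENT_VALUE and MEMORY_STAT_PEAK_VALUE can now return the value of the stat they look up. A minimal standalone sketch of the pattern (toy names, not Paddle code):

#include <cstdint>
#include <iostream>

struct Stat {
  int64_t GetPeakValue() const { return 42; }
};

// Old style: a statement-macro; cannot appear on the right of '='.
#define CALL_STAT_STMT(s, func) \
  do {                          \
    (s).func();                 \
  } while (0)

// New style: an immediately-invoked lambda; forwards the return value.
#define CALL_STAT_EXPR(s, func) \
  [&] { return (s).func(); }()

int main() {
  Stat s;
  int64_t peak = CALL_STAT_EXPR(s, GetPeakValue);  // usable as an expression
  std::cout << peak << std::endl;                  // prints 42
}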
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
@@ -49,6 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_uint64(gpu_memory_limit_mb);

+#ifdef PADDLE_WITH_TESTING
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
+                            "Whether to print the message of gpu memory usage "
+                            "at exit, mainly used for UT and CI.");
+#endif
+
 constexpr static float fraction_reserve_gpu_memory = 0.05f;

 USE_GPU_MEM_STAT;
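PADDLE_DEFINE_EXPORTED_bool is Paddle's wrapper around a gflags-style flag definition that, as the name suggests, also exports the symbol; guarding it with PADDLE_WITH_TESTING keeps the flag out of non-test builds. A hedged sketch of the equivalent using plain gflags (standard gflags API assumed, not Paddle's wrapper):

#include <gflags/gflags.h>

#ifdef PADDLE_WITH_TESTING
// Readable in code as FLAGS_enable_gpu_memory_usage_log and settable on the
// command line as --enable_gpu_memory_usage_log.
DEFINE_bool(enable_gpu_memory_usage_log, false,
            "Whether to print the message of gpu memory usage "
            "at exit, mainly used for UT and CI.");
#endif

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  return 0;
}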
@@ -137,12 +144,31 @@ class RecordedGpuMallocHelper {
     if (NeedRecord()) {
       mtx_.reset(new std::mutex());
     }
+
+#ifdef PADDLE_WITH_TESTING
+    if (FLAGS_enable_gpu_memory_usage_log) {
+      // A fake UPDATE to trigger the construction of memory stat instances,
+      // make sure that they are destructed after RecordedGpuMallocHelper.
+      MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
+    }
+#endif
   }

   DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper);

 public:
+  ~RecordedGpuMallocHelper() {
+#ifdef PADDLE_WITH_TESTING
+    if (FLAGS_enable_gpu_memory_usage_log) {
+      std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : "
+                << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl;
+    }
+#endif
+  }
+
   static RecordedGpuMallocHelper *Instance(int dev_id) {
+    static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;
     std::call_once(once_flag_, [] {
       int dev_cnt = GetGPUDeviceCount();
       instances_.reserve(dev_cnt);
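The "fake UPDATE" in the constructor exploits the C++ rule that function-local statics are destroyed in reverse order of construction completion: touching the stat singleton before the helper finishes constructing guarantees the stat outlives the helper, so the destructor above can still read the peak value at process exit. Moving instances_ into Instance() (see the removal in the next hunk) gives the helpers the same function-local-static lifetime. A standalone sketch of the ordering trick (toy types, not Paddle code):

#include <iostream>

struct Stat {
  long peak = 0;
  ~Stat() { std::cout << "Stat destroyed last\n"; }
};

Stat& GetStat() {
  static Stat s;  // constructed on first use
  return s;
}

struct Helper {
  Helper() { GetStat(); }  // "fake" touch: force Stat to exist first
  ~Helper() {              // runs before Stat's destructor
    std::cout << "peak = " << GetStat().peak << "\n";
  }
};

Helper& GetHelper() {
  static Helper h;  // finishes constructing after Stat, so destroyed first
  return h;
}

int main() { GetHelper(); }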
@@ -326,14 +352,11 @@ class RecordedGpuMallocHelper {
   mutable std::unique_ptr<std::mutex> mtx_;

   static std::once_flag once_flag_;
-  static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;

   std::set<void *> gpu_ptrs;  // just for testing
 };  // NOLINT

 std::once_flag RecordedGpuMallocHelper::once_flag_;
-std::vector<std::unique_ptr<RecordedGpuMallocHelper>>
-    RecordedGpuMallocHelper::instances_;

 gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
                              bool malloc_managed_memory) {
@@ -106,9 +106,6 @@ namespace phi {
 class ErrorSummary;
 }  // namespace phi

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
 DECLARE_int32(call_stack_level);

 namespace paddle {
@@ -539,7 +536,7 @@ inline void retry_sleep(unsigned milliseconds) {
         ::paddle::platform::details::ExternalApiType<                   \
             __CUDA_STATUS_TYPE__>::kSuccess;                            \
     while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
-      paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);    \
+      paddle::platform::retry_sleep(10000);                             \
       __cond__ = (COND);                                                \
       ++retry_count;                                                    \
     }                                                                   \
@@ -727,7 +724,7 @@ inline void retry_sleep(unsigned millisecond) {
         ::paddle::platform::details::ExternalApiType<                   \
             __CUDA_STATUS_TYPE__>::kSuccess;                            \
     while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
-      ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);  \
+      ::paddle::platform::retry_sleep(10000);                           \
       __cond__ = (COND);                                                \
       ++retry_count;                                                    \
     }                                                                   \
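Hard-coding retry_sleep(10000) removes enforce.h's dependency on the allocator flag (its DECLARE was deleted above); the retry interval becomes a fixed 10 000 ms instead of FLAGS_gpu_allocator_retry_time. A minimal sketch of the bounded-retry pattern these macros expand to (toy helper, not the actual macro expansion):

#include <chrono>
#include <iostream>
#include <thread>

// Retry a failing call up to max_retries times, sleeping a fixed
// interval_ms between attempts, mirroring the retry loop in the macros.
template <typename F>
bool RetryWithSleep(F&& call, int max_retries = 5,
                    unsigned interval_ms = 10000) {
  bool ok = call();
  int retry_count = 0;
  while (!ok && retry_count < max_retries) {
    std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms));
    ok = call();
    ++retry_count;
  }
  return ok;
}

int main() {
  int attempts = 0;
  // Succeeds on the third try; interval shortened so the demo runs quickly.
  bool ok = RetryWithSleep([&] { return ++attempts >= 3; },
                           /*max_retries=*/5, /*interval_ms=*/1);
  std::cout << std::boolalpha << ok << " after " << attempts
            << " attempts\n";  // true after 3 attempts
}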
 # for paddle test case
 if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils)
+  set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc)
+  if (WITH_GPU OR WITH_ROCM)
+    list(APPEND paddle_gtest_main_deps gpu_info)
+  endif()
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps})
 endif()
@@ -20,6 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/init.h"

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DECLARE_bool(enable_gpu_memory_usage_log);
+#endif
+
 int main(int argc, char** argv) {
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
   testing::InitGoogleTest(&argc, argv);
@@ -81,6 +85,13 @@ int main(int argc, char** argv) {
     VLOG(1) << "gtest undefok_string:" << undefok_string;
   }

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (strstr(undefok_str, "enable_gpu_memory_usage_log")) {
+    VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true";
+    FLAGS_enable_gpu_memory_usage_log = true;
+  }
+#endif
+
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   ::GFLAGS_NAMESPACE::ParseCommandLineFlags(
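paddle_gtest_main.cc flips the flag itself before ParseCommandLineFlags runs, presumably because the flag only exists in testing builds and is therefore routed through the --undefok list rather than parsed normally; a plain strstr on the undefok string is enough to detect the request. A reduced sketch of that detection (variable names assumed, not Paddle code):

#include <cstring>
#include <iostream>

static bool FLAGS_enable_gpu_memory_usage_log = false;

// Scan a comma-separated --undefok value for the flag name and enable the
// usage log before the flag parser would otherwise discard the request.
void MaybeEnableUsageLog(const char* undefok_str) {
  if (std::strstr(undefok_str, "enable_gpu_memory_usage_log")) {
    FLAGS_enable_gpu_memory_usage_log = true;
  }
}

int main() {
  MaybeEnableUsageLog("allocator_strategy,enable_gpu_memory_usage_log");
  std::cout << std::boolalpha << FLAGS_enable_gpu_memory_usage_log
            << std::endl;  // true
}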
@@ -20,6 +20,7 @@ import sys
 import paddle
 import paddle.fluid as fluid
 import importlib
+import paddle.fluid.core as core
 from six.moves import cStringIO

 sys.path.append(os.path.abspath(os.path.dirname(__file__)))
@@ -28,6 +29,10 @@ import static_mode_white_list

 def main():
     sys.path.append(os.getcwd())
+    if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
+        if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None):
+            os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
+
     some_test_failed = False
     for module_name in sys.argv[1:]:
         flag_need_static_mode = False
@@ -45,6 +50,7 @@ def main():
         module = importlib.import_module(module_name)
         tests = test_loader.loadTestsFromModule(module)
         res = unittest.TextTestRunner(stream=buffer).run(tests)
+
         if not res.wasSuccessful():
             some_test_failed = True
             print(