From 1967c6a69f56b773ffa3eec0af78b1428c469f49 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Tue, 6 Sep 2022 18:54:33 +0800
Subject: [PATCH] enable memory optimize when fp16. (#45792)

---
 .../fluid/inference/api/analysis_predictor.cc | 12 ++----
 paddle/fluid/inference/api/helper.h           | 41 +++++++++++++++++++
 paddle/phi/backends/gpu/gpu_context.cc        |  2 +-
 3 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9feca9620f..17f1396ce8 100755
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -292,6 +292,7 @@ bool AnalysisPredictor::Init(
     }
   }
 #endif
+  inference::DisplayMemoryInfo(place_, "Init predictor");
   return true;
 }
 
@@ -1050,14 +1051,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseFcPadding(config_.use_fc_padding());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
-  if (model_precision_ == phi::DataType::FLOAT32) {
-    argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  } else {
-    // TODO(inference): mixed precision temporarily not support memory_optim
-    LOG_FIRST_N(WARNING, 1) << "mixed precision model temporarily not support "
-                               "memory optim, so we just turn off that.";
-    argument_.SetEnableMemoryOptim(false);
-  }
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   argument_.SetPredictorID(predictor_id_);
@@ -1622,6 +1616,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 }
 
 bool AnalysisPredictor::ZeroCopyRun() {
+  inference::DisplayMemoryInfo(place_, "before run");
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     VLOG(3) << "ZeroCopyRun will use the fleet executor.";
@@ -1659,6 +1654,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #endif
 
   executor_->Run();
+  inference::DisplayMemoryInfo(place_, "after run");
 
   if (config_.shape_range_info_collected()) {
     CollectShapeRangeInfo();
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 471b95d525..e3b1453812 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -31,7 +31,9 @@
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/backends/dynload/port.h"
@@ -421,5 +423,44 @@ static bool IsFileExists(const std::string &path) {
 
 void RegisterAllCustomOperator();
 
+static inline double ToMegaBytes(size_t bytes) {
+  return static_cast<double>(bytes) / (1 << 20);
+}
+
+static inline void DisplayMemoryInfo(platform::Place place,
+                                     const std::string &hint) {
+#ifdef PADDLE_WITH_CUDA
+  // size_t free, total;
+  // cudaSetDevice(place.GetDeviceId());
+  // cudaMemGetInfo(&free, &total);
+  // VLOG(1) << "[" << ToMegaBytes(total - free) << "MB/" << ToMegaBytes(total)
+  // << "MB]";
+
+  VLOG(1) << hint << " : [gpu current allocated memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
+                 "Allocated", place.GetDeviceId()))
+          << "MB], [gpu current reserved memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
+                 "Reserved", place.GetDeviceId()))
+          << "MB], [gpu peak allocated memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
+                 "Allocated", place.GetDeviceId()))
+          << "MB], [gpu peak reserved memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
+                 "Reserved", place.GetDeviceId()))
+          << "MB]";
+#endif
+  VLOG(1)
+      << hint << " : [cpu current allocated memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Allocated", 0))
+      << "MB], [cpu current reserved memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Reserved", 0))
+      << "MB], [cpu peak allocated memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Allocated", 0))
+      << "MB], [cpu peak reserved memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Reserved", 0))
+      << "MB]";
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index 87d779f919..cf08f5b4af 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -575,7 +575,7 @@ struct GPUContext::Impl {
       if (!blas_tensor_core_handle_creator_) {
        phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
      } else {
-        phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
+        blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
      }
      PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
          blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-- 
GitLab
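A minimal usage sketch (not part of the patch) of what this change enables: with the FP32-only guard removed from PrepareArgument, a caller can request memory optimization for an FP16/mixed-precision model through the standard Paddle Inference C++ API, and the new DisplayMemoryInfo calls report allocator statistics at VLOG level 1 (e.g. run with GLOG_v=1). The model paths, input name, and shape below are placeholders, and the include path may differ depending on how the inference library is installed.

```cpp
// Hypothetical caller sketch; paths and shapes are placeholders.
#include <vector>

#include "paddle_inference_api.h"  // install-dependent path, e.g. paddle/include/

int main() {
  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/500, /*device_id=*/0);
  // With this patch, memory optimize is no longer forced off for non-FP32
  // (e.g. FP16 / mixed precision) models.
  config.EnableMemoryOptim();

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed a dummy input; names and shapes depend on the actual model.
  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputHandle(input_names[0]);
  std::vector<int> shape{1, 3, 224, 224};
  std::vector<float> data(1 * 3 * 224 * 224, 0.f);
  input->Reshape(shape);
  input->CopyFromCpu(data.data());

  // Each run now logs "before run" / "after run" GPU and CPU memory stats
  // when VLOG(1) is enabled, e.g. GLOG_v=1 ./demo
  predictor->Run();
  return 0;
}
```

Before this patch, EnableMemoryOptim() on a non-FP32 model was silently dropped with a one-time warning; after it, the flag is passed through to the analysis argument unconditionally.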