Unverified · Commit 1967c6a6 · Authored by: Wilber · Committed by: GitHub

enable memory optimize when fp16. (#45792)

Parent commit: 8f37c66f
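This commit removes the restriction that limited inference memory optimization to FP32 models, so EnableMemoryOptim() now also takes effect for mixed-precision (FP16) models. It additionally adds a DisplayMemoryInfo() helper that logs GPU/CPU memory statistics at predictor initialization and around each run, and fixes the cuBLAS tensor-core handle creation path in GPUContext. Below is a minimal sketch of how a user-side Paddle Inference config might enable memory optimization after this change; the model paths and pool size are placeholders, and how the FP16/mixed-precision model itself is produced is outside this diff.

```cpp
#include "paddle_inference_api.h"  // Paddle Inference C++ API (include path may vary by install)

int main() {
  paddle_infer::Config config;
  // Hypothetical model files; replace with a real (possibly mixed-precision) model.
  config.SetModel("model_dir/inference.pdmodel", "model_dir/inference.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/256, /*device_id=*/0);
  // With this commit, memory optimization is no longer silently turned off
  // for non-FP32 (mixed-precision) models.
  config.EnableMemoryOptim();

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... set inputs via predictor->GetInputHandle(name), then predictor->Run();
  return 0;
}
```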
@@ -292,6 +292,7 @@ bool AnalysisPredictor::Init(
     }
   }
 #endif
+  inference::DisplayMemoryInfo(place_, "Init predictor");
   return true;
 }
@@ -1050,14 +1051,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseFcPadding(config_.use_fc_padding());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
-  if (model_precision_ == phi::DataType::FLOAT32) {
-    argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  } else {
-    // TODO(inference): mixed precision temporarily not support memory_optim
-    LOG_FIRST_N(WARNING, 1) << "mixed precision model temporarily not support "
-                               "memory optim, so we just turn off that.";
-    argument_.SetEnableMemoryOptim(false);
-  }
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   argument_.SetPredictorID(predictor_id_);
@@ -1622,6 +1616,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 }
 
 bool AnalysisPredictor::ZeroCopyRun() {
+  inference::DisplayMemoryInfo(place_, "before run");
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     VLOG(3) << "ZeroCopyRun will use the fleet executor.";
@@ -1659,6 +1654,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #endif
   executor_->Run();
+  inference::DisplayMemoryInfo(place_, "after run");
   if (config_.shape_range_info_collected()) {
     CollectShapeRangeInfo();
...
@@ -31,7 +31,9 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/backends/dynload/port.h"
@@ -421,5 +423,44 @@ static bool IsFileExists(const std::string &path) {
 void RegisterAllCustomOperator();
 
+static inline double ToMegaBytes(size_t bytes) {
+  return static_cast<double>(bytes) / (1 << 20);
+}
+
+static inline void DisplayMemoryInfo(platform::Place place,
+                                     const std::string &hint) {
+#ifdef PADDLE_WITH_CUDA
+  // size_t free, total;
+  // cudaSetDevice(place.GetDeviceId());
+  // cudaMemGetInfo(&free, &total);
+  // VLOG(1) << "[" << ToMegaBytes(total - free) << "MB/" << ToMegaBytes(total)
+  // << "MB]";
+  VLOG(1) << hint << " : [gpu current allocated memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
+                 "Allocated", place.GetDeviceId()))
+          << "MB], [gpu current reserved memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
+                 "Reserved", place.GetDeviceId()))
+          << "MB], [gpu peak allocated memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
+                 "Allocated", place.GetDeviceId()))
+          << "MB], [gpu peak reserved memory: "
+          << ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
+                 "Reserved", place.GetDeviceId()))
+          << "MB]";
+#endif
+  VLOG(1)
+      << hint << " : [cpu current allocated memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Allocated", 0))
+      << "MB], [cpu current reserved memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Reserved", 0))
+      << "MB], [cpu peak allocated memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Allocated", 0))
+      << "MB], [cpu peak reserved memory: "
+      << ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Reserved", 0))
+      << "MB]";
+}
+
 } // namespace inference
 } // namespace paddle
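Note that the new DisplayMemoryInfo() helper reports through VLOG(1), so the GPU/CPU allocation statistics only show up when verbose glog logging is enabled for the inference program, typically via the GLOG_v environment variable (e.g. GLOG_v=1); with the default verbosity the added calls are effectively silent.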
@@ -575,7 +575,7 @@ struct GPUContext::Impl {
     if (!blas_tensor_core_handle_creator_) {
       phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
     } else {
-      phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
+      blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
     }
     PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
         blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
...
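The last hunk is a separate correctness fix in GPUContext: when a tensor-core cuBLAS handle creator has been registered, the else branch previously re-ran phi::InitBlasHandle instead of invoking the registered blas_tensor_core_handle_creator_(), so the registered creator was never used; the change calls the creator as intended.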