Unverified Commit 1967c6a6 authored by W Wilber, committed by GitHub

enable memory optimize when fp16. (#45792)

Parent 8f37c66f
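
Context note (editor's addition, not part of the commit): before this change, PrepareArgument forced memory optimization off whenever the model precision was not FLOAT32; with this change the user's setting is passed through unchanged. A minimal usage sketch with the Paddle Inference C++ API follows; the model paths and GPU memory pool size are placeholders.

// Sketch: requesting memory optimization for a mixed-precision (FP16) model.
// Paths and the memory pool size are placeholders, not values from the commit.
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./fp16_model/model.pdmodel",    // placeholder model file
                  "./fp16_model/model.pdiparams"); // placeholder params file
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/500, /*device_id=*/0);
  // Before this commit the flag below was silently ignored (forced to false)
  // for mixed-precision models; after it, the request takes effect.
  config.EnableMemoryOptim();
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}
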
......@@ -292,6 +292,7 @@ bool AnalysisPredictor::Init(
}
}
#endif
inference::DisplayMemoryInfo(place_, "Init predictor");
return true;
}
......@@ -1050,14 +1051,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetUseFcPadding(config_.use_fc_padding());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
if (model_precision_ == phi::DataType::FLOAT32) {
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
} else {
// TODO(inference): mixed precision temporarily not support memory_optim
LOG_FIRST_N(WARNING, 1) << "mixed precision model temporarily not support "
"memory optim, so we just turn off that.";
argument_.SetEnableMemoryOptim(false);
}
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
argument_.SetPredictorID(predictor_id_);
......@@ -1622,6 +1616,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
}
bool AnalysisPredictor::ZeroCopyRun() {
inference::DisplayMemoryInfo(place_, "before run");
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
VLOG(3) << "ZeroCopyRun will use the fleet executor.";
......@@ -1659,6 +1654,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
#endif
executor_->Run();
inference::DisplayMemoryInfo(place_, "after run");
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
......
......@@ -31,7 +31,9 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/backends/dynload/port.h"
......@@ -421,5 +423,44 @@ static bool IsFileExists(const std::string &path) {
void RegisterAllCustomOperator();
static inline double ToMegaBytes(size_t bytes) {
return static_cast<double>(bytes) / (1 << 20);
}
static inline void DisplayMemoryInfo(platform::Place place,
const std::string &hint) {
#ifdef PADDLE_WITH_CUDA
// size_t free, total;
// cudaSetDevice(place.GetDeviceId());
// cudaMemGetInfo(&free, &total);
// VLOG(1) << "[" << ToMegaBytes(total - free) << "MB/" << ToMegaBytes(total)
// << "MB]";
VLOG(1) << hint << " : [gpu current allocated memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
"Allocated", place.GetDeviceId()))
<< "MB], [gpu current reserved memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
"Reserved", place.GetDeviceId()))
<< "MB], [gpu peak allocated memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
"Allocated", place.GetDeviceId()))
<< "MB], [gpu peak reserved memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
"Reserved", place.GetDeviceId()))
<< "MB]";
#endif
VLOG(1)
<< hint << " : [cpu current allocated memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Allocated", 0))
<< "MB], [cpu current reserved memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Reserved", 0))
<< "MB], [cpu peak allocated memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Allocated", 0))
<< "MB], [cpu peak reserved memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Reserved", 0))
<< "MB]";
}
} // namespace inference
} // namespace paddle
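
Usage note (editor's addition, not from the commit): DisplayMemoryInfo logs through VLOG(1), so its reports only appear when glog verbosity is raised, for example by running the binary with GLOG_v=1. A minimal call sketch, assuming a CUDA build, device 0, and that the helper lives in paddle/fluid/inference/api/helper.h (the header shown in this hunk):

// Run with GLOG_v=1 (or higher) so the VLOG(1) reports are printed.
#include "paddle/fluid/inference/api/helper.h"  // assumed location of DisplayMemoryInfo
#include "paddle/fluid/platform/place.h"

void ReportMemory() {
  paddle::platform::CUDAPlace place(/*device_id=*/0);  // GPU 0; CUDA build assumed
  paddle::inference::DisplayMemoryInfo(place, "after warmup");  // hint string is arbitrary
}
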
......@@ -575,7 +575,7 @@ struct GPUContext::Impl {
if (!blas_tensor_core_handle_creator_) {
phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
} else {
phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
}
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
......
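
The last hunk fixes what looks like a copy-paste slip: when a tensor-core cuBLAS handle creator callback has been registered, the else branch previously re-ran the default phi::InitBlasHandle instead of invoking the callback. A generic sketch of the intended prefer-the-creator pattern (names are illustrative, not Paddle's):

#include <functional>

// Illustrative only: if a user-supplied factory is registered, use it;
// otherwise fall back to the default initializer.
struct BlasHandleHolder {
  void* handle = nullptr;                 // stand-in for a cublasHandle_t
  std::function<void*()> creator;         // optional user-supplied factory

  void EnsureHandle() {
    if (!creator) {
      handle = DefaultInit();             // default path (InitBlasHandle in Paddle)
    } else {
      handle = creator();                 // the fixed branch: call the creator
    }
  }
  static void* DefaultInit() { return nullptr; }  // placeholder
};
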