Unverified Commit 1967c6a6 authored by W Wilber, committed by GitHub

enable memory optimize when fp16. (#45792)

Parent 8f37c66f
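
Context note (editor's addition, not part of the commit): before this change, PrepareArgument forced memory optimization off whenever the model precision was not FLOAT32; with this change the user's setting is passed through unchanged. A minimal usage sketch with the Paddle Inference C++ API follows; the model paths and GPU memory pool size are placeholders.

// Sketch: requesting memory optimization for a mixed-precision (FP16) model.
// Paths and the memory pool size are placeholders, not values from the commit.
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./fp16_model/model.pdmodel",    // placeholder model file
                  "./fp16_model/model.pdiparams"); // placeholder params file
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/500, /*device_id=*/0);
  // Before this commit the flag below was silently ignored (forced to false)
  // for mixed-precision models; after it, the request takes effect.
  config.EnableMemoryOptim();
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}
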
......@@ -292,6 +292,7 @@ bool AnalysisPredictor::Init(
}
}
#endif
inference::DisplayMemoryInfo(place_, "Init predictor");
return true;
}
......@@ -1050,14 +1051,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetUseFcPadding(config_.use_fc_padding());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
if (model_precision_ == phi::DataType::FLOAT32) {
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
} else {
// TODO(inference): mixed precision temporarily not support memory_optim
LOG_FIRST_N(WARNING, 1) << "mixed precision model temporarily not support "
"memory optim, so we just turn off that.";
argument_.SetEnableMemoryOptim(false);
}
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
argument_.SetPredictorID(predictor_id_);
......@@ -1622,6 +1616,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
}
bool AnalysisPredictor::ZeroCopyRun() {
inference::DisplayMemoryInfo(place_, "before run");
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
VLOG(3) << "ZeroCopyRun will use the fleet executor.";
......@@ -1659,6 +1654,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
#endif
executor_->Run();
inference::DisplayMemoryInfo(place_, "after run");
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
......
......@@ -31,7 +31,9 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/backends/dynload/port.h"
......@@ -421,5 +423,44 @@ static bool IsFileExists(const std::string &path) {
void RegisterAllCustomOperator();
static inline double ToMegaBytes(size_t bytes) {
return static_cast<double>(bytes) / (1 << 20);
}
static inline void DisplayMemoryInfo(platform::Place place,
const std::string &hint) {
#ifdef PADDLE_WITH_CUDA
// size_t free, total;
// cudaSetDevice(place.GetDeviceId());
// cudaMemGetInfo(&free, &total);
// VLOG(1) << "[" << ToMegaBytes(total - free) << "MB/" << ToMegaBytes(total)
// << "MB]";
VLOG(1) << hint << " : [gpu current allocated memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
"Allocated", place.GetDeviceId()))
<< "MB], [gpu current reserved memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatCurrentValue(
"Reserved", place.GetDeviceId()))
<< "MB], [gpu peak allocated memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
"Allocated", place.GetDeviceId()))
<< "MB], [gpu peak reserved memory: "
<< ToMegaBytes(paddle::memory::DeviceMemoryStatPeakValue(
"Reserved", place.GetDeviceId()))
<< "MB]";
#endif
VLOG(1)
<< hint << " : [cpu current allocated memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Allocated", 0))
<< "MB], [cpu current reserved memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatCurrentValue("Reserved", 0))
<< "MB], [cpu peak allocated memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Allocated", 0))
<< "MB], [cpu peak reserved memory: "
<< ToMegaBytes(paddle::memory::HostMemoryStatPeakValue("Reserved", 0))
<< "MB]";
}
} // namespace inference
} // namespace paddle
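
Usage note (editor's addition, not from the commit): DisplayMemoryInfo logs through VLOG(1), so its reports only appear when glog verbosity is raised, for example by running the binary with GLOG_v=1. A minimal call sketch, assuming a CUDA build, device 0, and that the helper lives in paddle/fluid/inference/api/helper.h (the header shown in this hunk):

// Run with GLOG_v=1 (or higher) so the VLOG(1) reports are printed.
#include "paddle/fluid/inference/api/helper.h"  // assumed location of DisplayMemoryInfo
#include "paddle/fluid/platform/place.h"

void ReportMemory() {
  paddle::platform::CUDAPlace place(/*device_id=*/0);  // GPU 0; CUDA build assumed
  paddle::inference::DisplayMemoryInfo(place, "after warmup");  // hint string is arbitrary
}
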
......@@ -575,7 +575,7 @@ struct GPUContext::Impl {
if (!blas_tensor_core_handle_creator_) {
phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
} else {
phi::InitBlasHandle(&blas_tensor_core_handle_, stream());
blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
}
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
......
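
The last hunk fixes what looks like a copy-paste slip: when a tensor-core cuBLAS handle creator callback has been registered, the else branch previously re-ran the default phi::InitBlasHandle instead of invoking the callback. A generic sketch of the intended prefer-the-creator pattern (names are illustrative, not Paddle's):

#include <functional>

// Illustrative only: if a user-supplied factory is registered, use it;
// otherwise fall back to the default initializer.
struct BlasHandleHolder {
  void* handle = nullptr;                 // stand-in for a cublasHandle_t
  std::function<void*()> creator;         // optional user-supplied factory

  void EnsureHandle() {
    if (!creator) {
      handle = DefaultInit();             // default path (InitBlasHandle in Paddle)
    } else {
      handle = creator();                 // the fixed branch: call the creator
    }
  }
  static void* DefaultInit() { return nullptr; }  // placeholder
};
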