未验证 提交 989f3dde 编写于 作者: 周周周 提交者: GitHub

[Paddle-TRT] use hook to collect shape in CollectShapeRangeInfo API. (#54841)

* commit

* commit

* commit

* commit

* final commit

* use hook to collect shape and shape value
上级 b6321350
......@@ -69,8 +69,12 @@ void NaiveExecutor::Run() {
platform::NvtxRangeColor::Green);
#endif
for (auto &func : input_hookfuncs_) {
func(op.get(), scope_);
}
if (op->Type() == "while") {
op->SetOutputHooks(hookfuncs_);
op->SetOutputHooks(output_hookfuncs_);
}
op->Run(*scope_, place_);
......@@ -104,7 +108,7 @@ void NaiveExecutor::Run() {
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
for (auto &func : hookfuncs_) {
for (auto &func : output_hookfuncs_) {
func(op.get(), scope_);
}
}
......@@ -185,7 +189,11 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
}
// Appends an output hook; every registered hook is invoked for each op
// right after OperatorBase::Run (see the loop over output_hookfuncs_ in
// NaiveExecutor::Run).
void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
  // Only the renamed member is used post-rename; the old `hookfuncs_`
  // vector no longer exists, so registering into it would not compile.
  output_hookfuncs_.push_back(hookfunc);
}
// Appends an input hook; every registered hook is invoked for each op
// just before the op executes (see NaiveExecutor::Run).
void NaiveExecutor::RegisterInputHook(const HookFunc &hookfunc) {
  input_hookfuncs_.emplace_back(hookfunc);
}
void NaiveExecutor::MakeReusePlan(
......
......@@ -76,6 +76,7 @@ class NaiveExecutor {
void CloneLiteEnigne(int num, void* stream);
void RegisterOutputHook(const HookFunc& hookfunc);
void RegisterInputHook(const HookFunc& hookfunc);
private:
void CreateOps(const ProgramDesc& desc,
......@@ -88,7 +89,8 @@ class NaiveExecutor {
std::vector<std::unique_ptr<OperatorBase>> ops_;
Scope* scope_{nullptr};
std::vector<HookFunc> hookfuncs_;
std::vector<HookFunc> output_hookfuncs_;
std::vector<HookFunc> input_hookfuncs_;
// Record information that tensor_a should ShareBufferWith tensor_b.
std::unordered_map<OperatorBase*, std::unordered_map<phi::DenseTensor*, int>>
......
......@@ -2092,6 +2092,9 @@ bool AnalysisPredictor::ZeroCopyRun() {
}
#endif
if (config_.shape_range_info_collected()) {
HookCollectShapeRangeInfo();
}
#ifdef PADDLE_WITH_XPU
InferXPUContext *infer_xpu_ctx = nullptr;
if (config_.use_xpu_ && !config_.use_lite_) {
......@@ -2126,10 +2129,6 @@ bool AnalysisPredictor::ZeroCopyRun() {
}
#endif
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
......@@ -2193,65 +2192,46 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
}
#endif
// Runs one inference pass with a caller-supplied runtime configuration.
// Only XPU builds honor `config`; on other builds the call is a no-op that
// returns false.
// NOTE(review): `config` is blindly reinterpret_cast to XpuRuntimeConfig —
// the caller must guarantee the pointee type; there is no runtime check.
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
auto xpu_runtime_config =
reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
config_.xpu_config_.context = xpu_runtime_config->context;
// Switching streams: drain work queued on the previous stream before any
// later kernels are issued on the new one.
auto *stream = xpu_runtime_config->stream;
if (stream != nullptr && stream != predictor_stream_) {
paddle::platform::XPUStreamSync(
static_cast<paddle::xpuStream>(predictor_stream_));
predictor_stream_ = stream;
}
// The autotuned portion of L3 cache cannot exceed the total L3 budget.
auto l3_size = xpu_runtime_config->l3_size;
auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
PADDLE_ENFORCE_LE(
l3_autotune_size,
l3_size,
phi::errors::InvalidArgument(
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
l3_autotune_size,
l3_size));
config_.xpu_config_.l3_size = l3_size;
config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
config_.xpu_config_.l3_autotune_size = l3_autotune_size;
return ZeroCopyRun();
#endif
// Reached only when XPU support is compiled out.
return false;
}
void AnalysisPredictor::CollectShapeRangeInfo() {
// if use gpu, sync first.
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
if (config_.use_gpu()) {
void AnalysisPredictor::HookCollectShapeRangeInfo() {
auto hook = [&](const std::string &op_type,
const std::string &input_name,
const paddle::Tensor &var) -> void {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
if (config_.use_gpu()) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *dev_ctx = pool.Get(place_);
auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
auto *dev_ctx = pool.Get(place_);
auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
#else
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
#endif
#endif
}
}
std::vector<std::string> var_names = sub_scope_->LocalVarNames();
for (const auto &name : var_names) {
auto *var = sub_scope_->GetVar(name);
if (!var->IsType<phi::DenseTensor>()) {
continue;
auto *new_var = sub_scope_->GetVar(input_name);
if (!new_var) return;
if (!new_var->IsType<phi::DenseTensor>()) {
return;
}
auto tensor = var->Get<phi::DenseTensor>();
if (!tensor.initialized()) continue;
auto tensor = new_var->Get<phi::DenseTensor>();
if (!tensor.initialized()) return;
framework::DDim dim = tensor.dims();
std::vector<int32_t> shape(dim.size());
for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
shape_info_[name].emplace_back(shape);
if (shape.size() >= 1) {
shape_info_[input_name].emplace_back(shape);
} else if (tensor.numel() > 0) {
// This must be a zero dimension tensor.
PADDLE_ENFORCE_EQ(tensor.numel(),
1UL,
platform::errors::PreconditionNotMet(
"This tensor must have one element, but got %ld.",
tensor.numel()));
std::vector<int32_t> zero_shape(1, 1);
shape_info_[input_name].emplace_back(zero_shape);
}
// We need collect value range for shape tensor for Paddle-TRT's use.
// To be noticed, this method to identify all shape tensors is based on
......@@ -2296,9 +2276,41 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
nullptr);
#endif
}
shape_tensor_value_[name].emplace_back(int32_host);
shape_tensor_value_[input_name].emplace_back(int32_host);
}
};
RegisterInputHook(hook);
}
// Runs one inference pass with a caller-supplied runtime configuration.
// `config` must point to a paddle_infer::experimental::XpuRuntimeConfig;
// only XPU builds honor it.
//
// \param config opaque pointer, reinterpret_cast to XpuRuntimeConfig
//               (caller must guarantee the pointee type — no runtime check).
// \return the result of ZeroCopyRun() on XPU builds, false otherwise.
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
  auto xpu_runtime_config =
      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
  config_.xpu_config_.context = xpu_runtime_config->context;
  // Switching streams: drain work queued on the previous stream before any
  // later kernels are issued on the new one.
  auto *stream = xpu_runtime_config->stream;
  if (stream != nullptr && stream != predictor_stream_) {
    paddle::platform::XPUStreamSync(
        static_cast<paddle::xpuStream>(predictor_stream_));
    predictor_stream_ = stream;
  }
  // The autotuned portion of L3 cache cannot exceed the total L3 budget.
  auto l3_size = xpu_runtime_config->l3_size;
  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
  PADDLE_ENFORCE_LE(
      l3_autotune_size,
      l3_size,
      phi::errors::InvalidArgument(
          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
          l3_autotune_size,
          l3_size));
  config_.xpu_config_.l3_size = l3_size;
  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
  config_.xpu_config_.l3_autotune_size = l3_autotune_size;
  return ZeroCopyRun();
#else
  // XPU support compiled out: the runtime config cannot be honored. Using
  // #else (instead of a trailing `return false;` after #endif) removes the
  // unreachable return in XPU builds and silences the unused-parameter
  // warning in non-XPU builds.
  (void)config;
  return false;
#endif
}
void AnalysisPredictor::StatisticShapeRangeInfo() {
......@@ -2677,10 +2689,33 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
exe.Run(save_program, scope(), 0, true, true);
}
// Registers a user callback invoked for every DenseTensor input of every op
// just before the op runs. The executor-level bridge (which walks op inputs
// and fans out to all registered callbacks) is installed exactly once; later
// calls only append to input_hookfuncs_, which the bridge iterates.
// NOTE(review): the once_flag is function-static, so the bridge is installed
// only on the FIRST predictor instance that calls this — a second predictor's
// executor_ would never receive it; confirm the single-predictor assumption.
void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
  static std::once_flag register_input_hook_flag;
  std::call_once(register_input_hook_flag, [this] {
    executor_->RegisterInputHook(
        [this](framework::OperatorBase *op, framework::Scope *scope) {
          for (auto &input : op->Inputs()) {
            for (auto &var_name : input.second) {
              auto *var = scope->FindVar(var_name);
              if (!var || !var->IsType<phi::DenseTensor>()) continue;
              // Bind by const reference: Get<> returns a reference, and the
              // one copy we need is made by make_shared below — copying into
              // a local first would copy twice.
              const auto &dense_tensor = var->Get<phi::DenseTensor>();
              if (!dense_tensor.initialized()) continue;
              auto tensor = paddle::Tensor(
                  std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
              for (auto &hookfunc : this->input_hookfuncs_) {
                hookfunc(op->Type(), var_name, tensor);
              }
            }
          }
        });
  });
  input_hookfuncs_.push_back(hookfunc);
}
void AnalysisPredictor::RegisterOutputHook(
const OutputTensorHookFunc &hookfunc) {
static std::once_flag register_hook_flag;
std::call_once(register_hook_flag, [this] {
static std::once_flag register_output_hook_flag;
std::call_once(register_output_hook_flag, [this] {
executor_->RegisterOutputHook(
[this](framework::OperatorBase *op, framework::Scope *scope) {
for (auto &output : op->Outputs()) {
......@@ -2691,14 +2726,14 @@ void AnalysisPredictor::RegisterOutputHook(
if (!dense_tensor.initialized()) continue;
auto tensor = paddle::Tensor(
std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
for (auto &hookfunc : this->hookfuncs_) {
for (auto &hookfunc : this->output_hookfuncs_) {
hookfunc(op->Type(), var_name, tensor);
}
}
}
});
});
hookfuncs_.push_back(hookfunc);
output_hookfuncs_.push_back(hookfunc);
}
template <>
......@@ -2987,6 +3022,9 @@ uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
// Forwards the output-tensor hook to the wrapped AnalysisPredictor; the hook
// fires for each op output after the op executes.
void Predictor::RegisterOutputHook(const OutputTensorHookFunc &hookfunc) {
predictor_->RegisterOutputHook(hookfunc);
}
void Predictor::RegisterInputHook(const OutputTensorHookFunc &hookfunc) {
predictor_->RegisterInputHook(hookfunc);
}
// Returns the device execution stream of the wrapped predictor (nullptr when
// the backend has no stream concept — see the header comment).
void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
......
......@@ -328,6 +328,9 @@ class AnalysisPredictor : public PaddlePredictor {
///
void RegisterOutputHook(const OutputTensorHookFunc &hookfunc) override;
/// \brief Same as RegisterOutputHook
void RegisterInputHook(const InputTensorHookFunc &hookfunc) override;
///
/// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
///
......@@ -498,8 +501,7 @@ class AnalysisPredictor : public PaddlePredictor {
private:
void StatisticShapeRangeInfo();
void CollectShapeRangeInfo();
void HookCollectShapeRangeInfo();
void InitPlace();
void InitDeviceContexts();
void InitResourceManager(void *stream);
......@@ -598,7 +600,8 @@ class AnalysisPredictor : public PaddlePredictor {
private:
std::vector<OutputTensorHookFunc> hookfuncs_;
std::vector<OutputTensorHookFunc> output_hookfuncs_;
std::vector<InputTensorHookFunc> input_hookfuncs_;
// Some status here that help to determine the status inside the predictor.
bool status_is_cloned_{false};
......
......@@ -38,6 +38,7 @@ namespace paddle {
using PaddleDType = paddle_infer::DataType;
using PaddlePlace = paddle_infer::PlaceType;
using PaddleDataLayout = paddle_infer::DataLayout;
using paddle_infer::InputTensorHookFunc;
using paddle_infer::OutputTensorHookFunc;
/// \brief Memory manager for PaddleTensor.
......@@ -323,6 +324,9 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void RegisterOutputHook(const OutputTensorHookFunc& hookfunc) {}
/// \brief Same as RegisterOutputHook
virtual void RegisterInputHook(const InputTensorHookFunc& hookfunc) {}
/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
......
......@@ -200,6 +200,9 @@ class PD_INFER_DECL Predictor {
///
void RegisterOutputHook(const OutputTensorHookFunc& hookfunc);
/// The same as RegisterOutputHook.
void RegisterInputHook(const InputTensorHookFunc& hookfunc);
///
/// \brief Get the execution stream on devices with a concept of stream,
/// otherwise returns nullptr.
......
......@@ -38,6 +38,7 @@ using Strings = std::vector<std::string>;
using OutputTensorHookFunc = std::function<void(
const std::string&, const std::string&, const paddle::Tensor&)>;
using InputTensorHookFunc = OutputTensorHookFunc;
typedef void (*CallbackFunc)(void*);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册