Unverified commit 638965c5 · Author: denglin-github · Committer: GitHub

Update DlNNE engine (#45027)

* add config param for enable_dlnne and support calibration mode
* remove unused file
* refine code and add comments
* refine warning-tip code
Parent d7d9807e
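A minimal usage sketch of the extended `EnableDlnne` configuration API added by this commit (the model directory, the input name "x" and its shape, and the INT8/calibration settings are illustrative assumptions, not values from the commit):

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <unordered_set>
#include <vector>

#include "paddle_inference_api.h"  // header path may vary by install

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");  // hypothetical model directory
  // Offload subgraphs of >= 3 ops to DLNNE, run the calibration workflow,
  // and pin the shape of the (hypothetical) input "x".
  config.EnableDlnne(
      /*min_subgraph_size=*/3,
      /*max_batch_size=*/1,
      /*use_static_batch=*/false,
      /*weight_share_mode=*/"0",
      /*disable_nodes_by_outputs=*/{},
      /*input_dict=*/{{"x", {1, 3, 224, 224}}},
      /*use_calib_mode=*/true,
      paddle::AnalysisConfig::Precision::kInt8);
  return 0;
}
```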
......@@ -250,6 +250,22 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_use_static_batch, DlnneUseStaticBatch, bool);
DECL_ARGUMENT_FIELD(dlnne_weight_share_mode,
DlnneWeightShareMode,
std::string);
DECL_ARGUMENT_FIELD(dlnne_disable_nodes_by_outputs,
DlnneDisableNodesByOutputs,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(dlnne_use_calib_mode, DlnneUseCalibMode, bool);
DECL_ARGUMENT_FIELD(dlnne_precision_mode,
DlnnePrecisionMode,
AnalysisConfig::Precision);
using dlnne_input_shape_type = std::map<std::string, std::vector<int64_t>>;
DECL_ARGUMENT_FIELD(dlnne_input_shape_dict,
DlnneInputShapeDict,
dlnne_input_shape_type);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
DECL_ARGUMENT_FIELD(lite_passes_filter,
......
......@@ -209,8 +209,23 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
} else if (pass_name == "dlnne_subgraph_pass") {
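// Forward the DLNNE options gathered in Argument to the subgraph pass.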
auto precision_mode = argument->dlnne_precision_mode();
pass->Set("min_subgraph_size",
new int(argument->dlnne_min_subgraph_size()));
pass->Set("max_batch_size", new int(argument->dlnne_max_batch_size()));
pass->Set("use_static_batch",
new bool(argument->dlnne_use_static_batch()));
pass->Set("weight_share_mode",
new std::string(argument->dlnne_weight_share_mode()));
pass->Set("disable_nodes_by_outputs",
new std::unordered_set<std::string>(
argument->dlnne_disable_nodes_by_outputs()));
pass->Set("use_calib_mode", new bool(argument->dlnne_use_calib_mode()));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("input_shape_dict",
new std::map<std::string, std::vector<int64_t>>(
argument->dlnne_input_shape_dict()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
}
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <string>

namespace paddle {
namespace inference {
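// Registers a Python-side function pointer under the given name
// (presumably consumed by the DLNNE calibration flow added in this commit).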
int RegisterPyFunc(const std::string& name, void* pfn);
} // namespace inference
} // namespace paddle
......@@ -34,9 +34,6 @@ class Node;
namespace paddle {
namespace inference {
int ConvertGraph(std::string graph_name);
namespace analysis {
class DlnneSubgraphPass : public framework::ir::FusePassBase {
......@@ -44,6 +41,8 @@ class DlnneSubgraphPass : public framework::ir::FusePassBase {
void ApplyImpl(framework::ir::Graph *graph) const override;
private:
void InferShapeForDlnneMainGraph() const;
bool IsDynamicOp(std::string var_name, bool use_static_batch) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
void CreateDlnneOp(framework::ir::Node *x,
framework::ir::Graph *graph,
......
......@@ -283,6 +283,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
CP_MEMBER(dlnne_max_batchsize_);
CP_MEMBER(dlnne_use_static_batch_);
CP_MEMBER(dlnne_weight_share_mode_);
CP_MEMBER(dlnne_use_calib_mode_);
CP_MEMBER(dlnne_precision_mode_);
CP_MEMBER(dlnne_disable_nodes_by_outputs_);
CP_MEMBER(dlnne_input_shape_dict_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
......@@ -544,9 +551,24 @@ void AnalysisConfig::EnableTensorRtEngine(
#endif
}
void AnalysisConfig::EnableDlnne(int min_subgraph_size) {
void AnalysisConfig::EnableDlnne(
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
AnalysisConfig::Precision precision_mode) {
use_dlnne_ = true;
dlnne_min_subgraph_size_ = min_subgraph_size;
dlnne_max_batchsize_ = max_batch_size;
dlnne_use_static_batch_ = use_static_batch;
dlnne_weight_share_mode_ = weight_share_mode;
dlnne_disable_nodes_by_outputs_ = disable_nodes_by_outputs;
dlnne_input_shape_dict_ = dlnne_input_shape_dict;
dlnne_use_calib_mode_ = use_calib_mode;
dlnne_precision_mode_ = precision_mode;
Update();
}
......
......@@ -1107,6 +1107,14 @@ void AnalysisPredictor::PrepareArgument() {
LOG(INFO) << "Dlnne subgraph is enabled";
argument_.SetUseDlnne(true);
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
argument_.SetDlnneMaxBatchSize(config_.dlnne_max_batchsize_);
argument_.SetDlnneUseStaticBatch(config_.dlnne_use_static_batch_);
argument_.SetDlnneWeightShareMode(config_.dlnne_weight_share_mode_);
argument_.SetDlnneDisableNodesByOutputs(
config_.dlnne_disable_nodes_by_outputs_);
argument_.SetDlnneInputShapeDict(config_.dlnne_input_shape_dict_);
argument_.SetDlnneUseCalibMode(config_.dlnne_use_calib_mode_);
argument_.SetDlnnePrecisionMode(config_.dlnne_precision_mode_);
}
if (config_.lite_engine_enabled()) {
......
......@@ -663,7 +663,15 @@ struct PD_INFER_DECL AnalysisConfig {
void EnableTensorRtInspector();
bool tensorrt_inspector_enabled() { return trt_use_inspector_; }
void EnableDlnne(int min_subgraph_size = 3);
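///
/// \brief Turn on the DLNNE subgraph engine.
///
/// \param min_subgraph_size Minimum number of ops a subgraph must contain
///        to be offloaded to DLNNE.
/// \param max_batch_size Maximum batch size the engine is built for.
/// \param use_static_batch Fix a dynamic batch dimension to a static one.
/// \param weight_share_mode DLNNE weight sharing mode; one of "0", "1",
///        "2", "3", "01", "23", "0123".
/// \param disable_nodes_by_outputs Output names of nodes to exclude from
///        DLNNE subgraphs.
/// \param input_dict Map from input name to its shape.
/// \param use_calib_mode Whether to run the calibration workflow.
/// \param precision_mode Inference precision, kFloat32 by default.
///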
void EnableDlnne(
int min_subgraph_size = 3,
int max_batch_size = 1,
bool use_static_batch = false,
std::string weight_share_mode = "0",
std::unordered_set<std::string> disable_nodes_by_outputs = {},
std::map<std::string, std::vector<int64_t>> input_dict = {},
bool use_calib_mode = false,
AnalysisConfig::Precision precision_mode = Precision::kFloat32);
bool dlnne_enabled() const { return use_dlnne_; }
///
......@@ -1006,6 +1014,13 @@ struct PD_INFER_DECL AnalysisConfig {
// dlnne related.
bool use_dlnne_{false};
int dlnne_min_subgraph_size_{3};
int dlnne_max_batchsize_{1};
std::unordered_set<std::string> dlnne_disable_nodes_by_outputs_;
bool dlnne_use_static_batch_{true};
std::string dlnne_weight_share_mode_;
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict_{};
bool dlnne_use_calib_mode_{false};
Precision dlnne_precision_mode_{Precision::kFloat32};
// memory reuse related.
bool enable_memory_optim_{false};
......
......@@ -269,12 +269,28 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
return config->config.tensorrt_engine_enabled();
}
void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) {
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableDlnne(min_subgraph_size);
void PD_EnableDlnne(
PD_AnalysisConfig* config,
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
AnalysisConfig::Precision precision_mode) {
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableDlnne(min_subgraph_size,
max_batch_size,
use_static_batch,
weight_share_mode,
disable_nodes_by_outputs,
dlnne_input_shape_dict,
use_calib_mode,
precision_mode);
}
bool PD_DlnneEnabled(const PD_AnalysisConfig* config) {
......
......@@ -9,21 +9,19 @@ endforeach()
# add nne
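# Locate the DLNNE SDK headers and libraries via the DL_SDK_DIR environment variable.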
find_path(
DLNNE_INCLUDE_DIR dlnne.h
PATHS $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include
PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/include/dlnne
NO_DEFAULT_PATH)
find_library(
DLNNE_LIB libdlnne.so
PATHS $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne
PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
NO_DEFAULT_PATH)
find_path(CUDA_INCLUDE_DIR cuda.h
$ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include)
find_path(CUDA_INCLUDE_DIR cuda.h $ENV{DL_SDK_DIR}/include)
find_library(
CURT_LIB libcurt.so
PATHS $ENV{SOFTWARE_BUILD_DIR}
$ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib
PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
NO_DEFAULT_PATH)
message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR})
......
......@@ -28,6 +28,105 @@ void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) {
cudaDeviceSynchronize();
}
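// Map a Paddle DataType to the type-name string used by the DLNNE
// calibration flow.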
std::string ConvertType(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return "float32";
}
case paddle::experimental::DataType::INT64: {
return "int64";
}
case paddle::experimental::DataType::INT32: {
return "int32";
}
case paddle::experimental::DataType::FLOAT16: {
return "float16";
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
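// Size in bytes of a single element of the given DataType.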
int GetDataByte(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return 4;
}
case paddle::experimental::DataType::INT64: {
return 8;
}
case paddle::experimental::DataType::INT32: {
return 4;
}
case paddle::experimental::DataType::FLOAT16: {
return 2;
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
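// Generate a random 32-character key from distinct alphanumeric characters.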
std::string GenerateRandomKey() {
std::string str(
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::shuffle(str.begin(), str.end(), generator);
return str.substr(0, 32);
}
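// Export the Paddle subgraph under subgraph_root_path to ONNX by shelling
// out to the paddle2onnx CLI; skipped when the ONNX file already exists.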
void ConvertPaddle2Onnx(std::string onnx_file_name,
std::string subgraph_root_path) {
if (!FileExists(onnx_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "paddle2onnx --model_dir " << subgraph_root_path
<< " --save_file " << onnx_file_name << " --opset_version 11";
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert paddle to onnx failed"));
}
}
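// Convert the ONNX model to rlym format and quantize it against the
// calibration dataset via the "python -m dl" tool; each step is skipped
// when its output file already exists.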
void QuantizeOnnx(std::string onnx_file_name,
std::string rlym_file_name,
std::string quantized_rlym_file_name,
std::string dataset_path,
std::string dataset_plugin_path) {
if (!FileExists(rlym_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "python -m dl convert " << onnx_file_name
<< " --output-model " << rlym_file_name;
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert onnx to rlym failed"));
}
if (!FileExists(quantized_rlym_file_name.c_str())) {
std::stringstream quantize_cmd;
quantize_cmd << "python -m dl quantize "
<< "--dataset " << dataset_path << " --plugin "
<< dataset_plugin_path << " " << rlym_file_name;
LOG(INFO) << quantize_cmd.str();
int quantize_flag = system(quantize_cmd.str().c_str());
PADDLE_ENFORCE_EQ(quantize_flag,
0,
platform::errors::Unavailable("quantize model failed"));
}
}
} // namespace inference
namespace operators {
......@@ -41,7 +140,23 @@ class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different DLNNE Engines");
AddAttr<framework::BlockDesc*>("sub_block", "the trt block");
AddAttr<int32_t>("max_batch_size", "engine max_batch_size");
AddAttr<bool>("use_static_batch", "static batch fix for [?,H,W,C]");
AddAttr<std::string>("weight_share_mode",
"dlnne weight_share_mode, can be '0', '1', '2', '3', "
"'01', '23', '0123' ");
// When use_calib_mode and enable_int8 are both true, the calibration
// runtime starts. While calibration_mode is true, the calibration runtime
// runs the first stage of calibration; once the first stage finishes,
// calibration_mode is set to false and the calibration runtime proceeds
// to the second stage.
AddAttr<bool>("use_calib_mode", "dlnne use calib mode");
AddAttr<bool>("enable_int8", "dlnne enable int8");
AddAttr<bool>("calibration_mode", "dlnne calibration_mode");
AddAttr<std::string>("calibration_data_path", "calibration data path");
AddAttr<std::string>("subgraph_root_path", "subgraph root path");
AddAttr<framework::BlockDesc*>("sub_block", "the dlnne block");
AddComment("Dlnne engine operator.");
}
};
......
......@@ -730,7 +730,16 @@ void BindAnalysisConfig(py::module *m) {
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne",
&AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3)
py::arg("min_subgraph_size") = 3,
py::arg("max_batch_size") = 1,
py::arg("use_static_batch") = false,
py::arg("weight_share_mode") = "0",
py::arg("disable_nodes_by_outputs") =
std::unordered_set<std::string>(),
py::arg("input_shape_dict") =
std::map<std::string, std::vector<int64_t>>(),
py::arg("use_calib_mode") = false,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("enable_lite_engine",
&AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
......