Commit c79f06d3 authored by nhzlx

cherry-pick from feature/anakin-engine: add batch interface for pd-anakin #16178

Parent 69d37f81
@@ -90,10 +90,12 @@ class AnakinOpConverter {
       for (int i = 0; i < var_shape.size(); i++) {
         input_shape.push_back(var_shape[i]);
       }
-      input_shape[0] = 1;
+      input_shape[0] = engine->GetMaxBatch();
       engine->SetInputShape(input, input_shape);
     }
+    // engine->Graph()->RegistAllOut();
     engine->Optimize();
     engine->InitGraph();
   }
......
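The point of this hunk: the converter used to pin every input's leading (batch) dimension to 1 when registering shapes with Anakin; it now uses the engine's configured maximum, so the optimized graph is built with buffers that can hold any batch up to that limit. A minimal compilable sketch of the same logic, with the engine type left as a template parameter since the concrete converter types are not shown in this diff:

#include <cstdint>
#include <string>
#include <vector>

// Register one graph input with the engine, widening the batch dimension
// to the engine's max batch (previously hard-coded to 1).
template <typename Engine>
void RegisterInputShape(Engine *engine, const std::string &input,
                        const std::vector<int64_t> &var_shape) {
  std::vector<int> input_shape;
  input_shape.reserve(var_shape.size());
  for (size_t i = 0; i < var_shape.size(); i++) {
    input_shape.push_back(static_cast<int>(var_shape[i]));
  }
  input_shape[0] = engine->GetMaxBatch();
  engine->SetInputShape(input, input_shape);
}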
@@ -34,10 +34,12 @@ namespace anakin {
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
-                                                            int device)
+                                                            int device,
+                                                            int max_batch_size)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
       net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
   device_ = device;
+  max_batch_size_ = max_batch_size;
 }

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
@@ -71,8 +73,8 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
   for (const auto &input : inputs) {
     auto *tensor = input.second;
     auto *data = tensor->data<float>();
     auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
     auto net_shape = anakin_input->shape();
     if (tensor->numel() > net_shape.count()) {
@@ -84,11 +86,13 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     anakin_input->reshape(fluid_input_shape);
     net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       net_shape);
-    anakin_input->share_from(tmp_anakin_tensor);
+                                                       // net_shape);
+                                                       fluid_input_shape);
+    anakin_input->copy_from(tmp_anakin_tensor);
   }
+  cudaDeviceSynchronize();
   net_->prediction();
   for (const auto &output : outputs) {
     platform::CUDAPlace gpu_place(device_);
@@ -98,12 +102,10 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto anakin_output_shape = anakin_output->valid_shape();
     tensor->Resize(framework::make_ddim(anakin_output_shape));
     auto *fluid_data = tensor->mutable_data<float>(gpu_place);
     memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
                  static_cast<void *>(anakin_data),
                  tensor->numel() * sizeof(float), stream);
   }
   cudaDeviceSynchronize();
 }
......
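Two behavioral changes land in this hunk. First, the temporary Anakin tensor now wraps the caller's data with the actual input shape and is copied in with copy_from; the old share_from aliased the caller's buffer, which is no longer safe once the network's input buffer is pre-sized for the maximum batch and the runtime batch may be smaller. Second, an explicit cudaDeviceSynchronize() fences the feed before prediction(). For orientation, a hedged sketch of how a caller drives Execute; the LoDTensor map types are an assumption inferred from the loop bodies above (input.first as name, input.second as tensor pointer), not something this diff shows:

// Hedged usage sketch, not part of this diff.
std::map<std::string, paddle::framework::LoDTensor *> inputs, outputs;
inputs["input_0"] = &in_tensor;    // runtime batch must be <= max_batch_size
outputs["output_0"] = &out_tensor; // resized by the engine from valid_shape()

cudaStream_t stream;
cudaStreamCreate(&stream);
engine->Execute(inputs, outputs, stream);  // feed, predict, fetch
cudaStreamDestroy(stream);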
@@ -55,7 +55,8 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;

  public:
-  explicit AnakinEngine(bool need_summary = false, int device = 0);
+  explicit AnakinEngine(bool need_summary = false, int device = 0,
+                        int max_batch_size = 1);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -70,10 +71,12 @@ class AnakinEngine {
                      "Add operation's attribution.");
   }
   NetT *Net() { return net_.get(); }
+  GraphT *Graph() { return graph_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  int GetMaxBatch() { return max_batch_size_; }
   // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
   // }
   // void LoadSerializedData(const std::string& data) {
@@ -83,6 +86,7 @@ class AnakinEngine {
                cudaStream_t stream);

  private:
+  int max_batch_size_;
   int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
@@ -100,10 +104,11 @@ class AnakinEngineManager {
     return engines_.at(name).get();
   }

-  AnakinNvEngineT *Create(bool need_summary, int device,
+  AnakinNvEngineT *Create(bool need_summary, int device, int max_batch_size,
                           std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device,
+                                                    max_batch_size);
     engines_[engine_name].reset(p);
     return p;
   }
......
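Create now threads max_batch_size through to the engine constructor, and the new GetMaxBatch() accessor is what the op converter above reads. The call below mirrors how the Anakin subgraph pass (further down in this commit) obtains an engine; treat the namespace qualification as an approximation, since the surrounding using-declarations are not part of the diff:

// Hedged sketch of creating a max-batch-aware engine via the manager.
auto *engine =
    inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
        /*need_summary=*/true, /*device=*/0, /*max_batch_size=*/8,
        /*engine_name=*/"anakin_engine_0");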
@@ -150,6 +150,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);

   // Memory optimized related.
......
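DECL_ARGUMENT_FIELD is the Argument struct's field-declaring macro; declaring anakin_max_batch_size this way is what makes the argument_.SetAnakinMaxBatchSize(...) setter (used in analysis_predictor.cc below) and the argument->anakin_max_batch_size() getter (used in ir_pass_manager.cc below) available. Roughly, and simplified from the real macro in argument.h, the expansion provides something like:

// Assumed, simplified expansion of
// DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int):
int anakin_max_batch_size_;
bool anakin_max_batch_size_valid_{false};  // backs Argument::Has()
int anakin_max_batch_size() const { return anakin_max_batch_size_; }
void SetAnakinMaxBatchSize(int v) {
  anakin_max_batch_size_ = v;
  anakin_max_batch_size_valid_ = true;
}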
@@ -77,6 +77,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
     }

     if (pass_name == "tensorrt_subgraph_pass") {
@@ -91,16 +92,20 @@ void IRPassManager::CreatePasses(Argument *argument,
                               AnalysisConfig::Precision::kInt8;
       pass->Set("enable_int8", new bool(enable_int8));
-      std::string model_opt_cache_dir =
-          argument->Has("model_dir")
-              ? argument->model_dir()
-              : GetDirRoot(argument->model_program_path());
-      pass->Set(
-          "model_opt_cache_dir",
-          new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      bool use_static_engine = argument->tensorrt_use_static_engine();
+      bool model_from_memory = argument->model_from_memory();
+      if ((!model_from_memory && use_static_engine)) {
+        std::string model_opt_cache_dir =
+            argument->Has("model_dir")
+                ? argument->model_dir()
+                : GetDirRoot(argument->model_program_path());
+        pass->Set(
+            "model_opt_cache_dir",
+            new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      }
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("use_static_engine",
-                new bool(argument->tensorrt_use_static_engine()));
+      pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
......
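The max batch size travels from the Argument to the subgraph pass through the generic pass-attribute mechanism: the manager installs a heap-allocated value under a string key with pass->Set (which takes ownership of the pointer), and the pass later reads it back by the same key with Get<T>. Both ends of the handoff appear verbatim in this commit:

// Producer side (this hunk, IRPassManager::CreatePasses):
pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));

// Consumer side (next hunk, AnakinSubgraphPass::CreateAnakinOp):
int max_batch_size = Get<int>("max_batch_size");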
@@ -256,10 +256,11 @@ void AnakinSubgraphPass::CreateAnakinOp(
       input_names_with_id, output_names_with_id, std::to_string(predictor_id));
   SetAttr(op_desc->Proto(), "engine_key", engine_key);

+  int max_batch_size = Get<int>("max_batch_size");
   auto *anakin_engine =
       inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), engine_key);
+          true, Get<int>("gpu_device_id"), max_batch_size, engine_key);

   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
......
@@ -245,8 +245,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
-    trt_engine_serialized_data = GetTrtEngineSerializedData(
-        Get<std::string>("model_opt_cache_dir"), engine_key);
+    if (use_static_engine && !load_from_memory) {
+      trt_engine_serialized_data = GetTrtEngineSerializedData(
+          Get<std::string>("model_opt_cache_dir"), engine_key);
+    }

     if (trt_engine_serialized_data.empty()) {
       LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
@@ -267,10 +270,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       trt_engine_serialized_data =
           std::string((const char *)serialized_engine_data->data(),
                       serialized_engine_data->size());
-      SaveTrtEngineSerializedDataToFile(
-          GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
-                                     engine_key),
-          trt_engine_serialized_data);
+      if (use_static_engine && !load_from_memory) {
+        SaveTrtEngineSerializedDataToFile(
+            GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                       engine_key),
+            trt_engine_serialized_data);
+      }
     } else {
       LOG(INFO) << "Load TRT Optimized Info from "
                 << GetTrtEngineSerializedPath(
......
@@ -109,6 +109,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);

+  CP_MEMBER(use_anakin_);
+  CP_MEMBER(anakin_max_batchsize_);
+
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
   CP_MEMBER(use_feed_fetch_ops_);
@@ -352,7 +355,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
-void AnalysisConfig::EnableAnakinEngine() {
+void AnalysisConfig::EnableAnakinEngine(int max_batch_size) {
+  anakin_max_batchsize_ = max_batch_size;
   use_anakin_ = true;
   Update();
 }
......
@@ -379,6 +379,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }

   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
......
@@ -145,7 +145,7 @@ struct AnalysisConfig {
   /**
    * \brief Turn on the usage of Anakin sub-graph engine.
    */
-  void EnableAnakinEngine();
+  void EnableAnakinEngine(int max_batch_size = 1);

   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -270,6 +270,7 @@ struct AnalysisConfig {
   mutable std::unique_ptr<PassStrategy> pass_builder_;
   bool use_anakin_{false};
+  int anakin_max_batchsize_;
   std::map<std::string, std::string> engine_opt_info_;
 };
......
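With the plumbing above in place, users opt in through AnalysisConfig. A hedged end-to-end sketch: SetModel, EnableUseGpu, and CreatePaddlePredictor are the standard Paddle inference entry points of this era rather than part of this diff, and the model path is a placeholder:

#include "paddle_inference_api.h"  // assumed include name

paddle::AnalysisConfig config;
config.SetModel("./mobilenet_model");             // placeholder model directory
config.EnableUseGpu(/*memory_pool_mb=*/100, /*device_id=*/0);
config.EnableAnakinEngine(/*max_batch_size=*/8);  // new in this commit

auto predictor = paddle::CreatePaddlePredictor(config);
// Any feed with batch size <= 8 now fits the engine's pre-allocated buffers.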