From ecc12fb43025022e3cc35e34607874420ca397e8 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Fri, 15 Feb 2019 07:43:20 +0000
Subject: [PATCH] 3. when running in trt mode, do not allocate memory for
 parameters in fluid. test=develop

---
 paddle/fluid/framework/ir/fuse_pass_base.h    |  5 ++
 .../ir_passes/tensorrt_subgraph_pass.cc       | 42 +++++++---
 .../ir_passes/tensorrt_subgraph_pass.h        |  7 +-
 .../ir_params_sync_among_devices_pass.cc      | 11 +++
 .../ir_params_sync_among_devices_pass.h       |  1 +
 .../inference/tensorrt/convert/op_converter.h | 62 ++++++++++++++
 .../operators/tensorrt/tensorrt_engine_op.h   | 81 +++----------------
 7 files changed, 126 insertions(+), 83 deletions(-)
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
index c53b2a61867..ed3796c5ff4 100644
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,6 +25,10 @@ namespace ir {
 
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use TensorRT or another third-party lib, the parameters are managed
+// by that lib rather than by fluid. So we need to record them to avoid
+// duplicate allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";
 
 enum FuseOptions {
   DO_NOT_FUSE,  // fusing will not be done
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index d91f62a12f9..1da48b5d61a 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,8 +14,6 @@
 
 #include <algorithm>
 #include <set>
-#include <string>
-#include <vector>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
@@ -42,7 +40,6 @@ void RenameAndGetOutputs(
     std::unordered_map<std::string, std::string> *output_name_map);
 
 std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
-
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
 
@@ -55,9 +52,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
                       Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
 
+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // These parameters already exist in trt and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);
 
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -72,6 +76,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     }
   }
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
 
   return graph;
 }
@@ -89,8 +95,10 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
   return engine_key;
 }
 
-void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
-                                            Graph *graph) const {
+void TensorRtSubgraphPass::CreateTensorRTOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -124,10 +132,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+
+  // node->inputs contains both input tensors and parameters.
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
+
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
   for (auto *x : node->outputs) {
@@ -161,6 +176,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }
+  PADDLE_ENFORCE(!output_mapping.empty());
 
   auto *vars = block_desc.Proto()->mutable_vars();
   for (framework::ir::Node *node : graph->Nodes()) {
@@ -172,22 +188,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
 
+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
   op_desc->SetOutput(
       "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");
 
-  PADDLE_ENFORCE(!output_mapping.empty());
   op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
-  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);
 
   auto enable_int8 = Get<bool>("enable_int8");
   auto engine_key =
@@ -200,6 +215,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
 
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+
+  if (!(enable_int8 && calibration_data.size() == 0)) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+  }
 }
 
 std::vector<std::string> ExtractParameters(
@@ -211,7 +231,7 @@ std::vector<std::string> ExtractParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index 502353b95fc..144f8bbd0e4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -14,6 +14,8 @@
 
 #pragma once
 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
@@ -26,8 +28,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;
 
  private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8be2d3ac0b1..d13ec7608c3 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;
 
+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
   LOG(INFO) << "Sync params from CPU to GPU";
 
   PADDLE_ENFORCE(argument->gpu_device_id_valid());
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
     auto *var = scope->FindLocalVar(var_name);
     PADDLE_ENFORCE(var != nullptr);
     if (var->IsType<framework::LoDTensor>() ||
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index a95f460df6f..61990150a30 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 91670ba8ac5..ab50758c824 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -16,9 +16,11 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
@@ -26,6 +28,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {  // NOLINT
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+
+}  // namespace // NOLINT
+
 /*
  * Convert Op from Fluid to TensorRT Engine.
  */
@@ -110,6 +143,35 @@ class OpConverter {
     }
   }
 
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto& t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(t_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
+
   void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
 
   virtual ~OpConverter() {}
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 33bbb6f165a..dcc046648a0 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -31,37 +31,6 @@ namespace paddle {
 
 namespace operators {
 
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-
-namespace {  // NOLINT
-
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-
-}  // namespace // NOLINT
-
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -161,7 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
             new TensorRTEngine(max_batch_size_, workspace_size_, enable_int8_,
                                calib_res->calib_.get()));
         VLOG(3) << "start the calib trt engine thread";
-        Prepare(scope, calib_res->engine_.get());
+        PrepareTRTEngine(scope, calib_res->engine_.get());
       }));
     }
 
@@ -259,7 +228,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
                                            enable_int8_, calibrator_.get()));
       if (true) {
-        Prepare(scope, trt_engine_.get());
+        PrepareTRTEngine(scope, trt_engine_.get());
       } else {
         // create static engine
       }
@@ -267,49 +236,21 @@ class TensorRTEngineOp : public framework::OperatorBase {
     return trt_engine_.get();
   }
 
-  void Prepare(const framework::Scope &scope, TensorRTEngine *engine) const {
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
     LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                  "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-
-    engine->InitNetwork();
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
 
-    VLOG(4) << "parsed var size " << block.AllVars().size();
-    std::vector<std::string> output_maps =
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
         Attr<std::vector<std::string>>("output_name_mapping");
 
-    // Add inputs
-    VLOG(4) << "declare inputs";
-    for (auto &input : Inputs("Xs")) {
-      if (param_names_.count(input)) continue;
-      VLOG(4) << "declare input " << input;
-
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-
-      auto *var = block.FindVar(input);
-      // TensorRT engine need to create parameters. The parameter's description
-      // should be set in
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
-    }
-
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
-
-    // Add outputs
-    for (auto &output : output_maps) {
-      engine->DeclareOutput(output);
-    }
-    engine->FreezeNetwork();
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
   }
 };
 
-- 
GitLab