3. when runing in trt mode, do not allocate memory for parameters in fluid.

test=develop

3. when runing in trt mode, do not allocate memory for parameters in fluid.
test=develop
4f77248d · nhzlx · ceci3 · 8c171902 · 4f77248d · 4f77248d
7 changed file
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,6 +25,10 @@ namespace ir {
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use trt or other third_party lib, the parameters are managered by
+// the lib, but not the fluid. So we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";
 enum FuseOptions {
  DO_NOT_FUSE,  // fusing will not be done

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,8 +14,6 @@
 #include <algorithm>
 #include <set>
-#include <string>
-#include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
@@ -42,7 +40,6 @@ void RenameAndGetOutputs(
    std::unordered_map<std::string, std::string> *output_name_map);
 std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
@@ -55,9 +52,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
                      Get<int>("min_subgraph_size") /*min subgraph size*/);
  fuser();
+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // those parameter already exist in trt, and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);
      std::unordered_set<const Node *> nodes2remove(
          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -72,6 +76,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
  return graph;
 }
@@ -89,8 +95,10 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
  return engine_key;
 }
-void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
+void TensorRtSubgraphPass::CreateTensorRTOp(
-                                            Graph *graph) const {
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
  auto *op_desc = node->Op();
  auto &subgraph = *Agent(node).subgraph();
  PADDLE_ENFORCE(!subgraph.empty());
@@ -124,10 +132,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // is unique.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+  // The node->inputs containes input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
  }
  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
  for (auto *x : node->outputs) {
@@ -161,6 +176,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
+  PADDLE_ENFORCE(!output_mapping.empty());
  auto *vars = block_desc.Proto()->mutable_vars();
  for (framework::ir::Node *node : graph->Nodes()) {
@@ -172,22 +188,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
  op_desc->SetOutput(
      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");
-  PADDLE_ENFORCE(!output_mapping.empty());
  op_desc->SetBlockAttr("sub_block", new_block);
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
-  // Set attrs
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);
  auto enable_int8 = Get<bool>("enable_int8");
  auto engine_key =
@@ -200,6 +215,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
  SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  if (!(enable_int8 && calibration_data.size() == 0)) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+  }
 }
 std::vector<std::string> ExtractParameters(
@@ -211,7 +231,7 @@ std::vector<std::string> ExtractParameters(
  for (const auto &node : nodes) {
    if (!node->IsOp()) continue;
    std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
      std::copy(output_names.begin(), output_names.end(),
                std::back_inserter(feed_outputs));

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -14,6 +14,8 @@
 #pragma once
 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
 namespace paddle {
@@ -26,8 +28,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
      std::unique_ptr<framework::ir::Graph> graph) const override;
 private:
-  void CreateTensorRTOp(framework::ir::Node *x,
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
-                        framework::ir::Graph *graph) const;
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
  void CleanIntermediateOutputs(framework::ir::Node *node);
 };

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // The parameters are on the cpu, therefore, synchronization is not necessary.
  if (!argument->use_gpu()) return;
+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
  LOG(INFO) << "Sync params from CPU to GPU";
  PADDLE_ENFORCE(argument->gpu_device_id_valid());
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // Because there exists the case that new parameter variables are not added to
  // the program in the analysis pass.
  for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
    auto *var = scope->FindLocalVar(var_name);
    PADDLE_ENFORCE(var != nullptr);
    if (var->IsType<framework::LoDTensor>() ||

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -16,9 +16,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -26,6 +28,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+namespace {  // NOLINT
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+}  // namespace // NOLINT
 /*
 * Convert Op from Fluid to TensorRT Engine.
 */
@@ -110,6 +143,35 @@ class OpConverter {
    }
  }
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto& t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(t_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
  virtual ~OpConverter() {}

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -31,37 +31,6 @@ namespace paddle {
 namespace operators {
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-namespace {  // NOLINT
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-}  // namespace // NOLINT
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -161,7 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
            new TensorRTEngine(max_batch_size_, workspace_size_, enable_int8_,
                               calib_res->calib_.get()));
        VLOG(3) << "start the calib trt engine thread";
-        Prepare(scope, calib_res->engine_.get());
+        PrepareTRTEngine(scope, calib_res->engine_.get());
      }));
    }
@@ -259,7 +228,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
      trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
                                           enable_int8_, calibrator_.get()));
      if (true) {
-        Prepare(scope, trt_engine_.get());
+        PrepareTRTEngine(scope, trt_engine_.get());
      } else {
        // create static engine
      }
@@ -267,49 +236,21 @@ class TensorRTEngineOp : public framework::OperatorBase {
    return trt_engine_.get();
  }
-  void Prepare(const framework::Scope &scope, TensorRTEngine *engine) const {
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                 "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
+    framework::proto::BlockDesc block_proto;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+    framework::BlockDesc block_desc(nullptr, &block_proto);
-    engine->InitNetwork();
-    VLOG(4) << "parsed var size " << block.AllVars().size();
+    std::vector<std::string> inputs = Inputs("Xs");
-    std::vector<std::string> output_maps =
+    std::vector<std::string> outputs =
        Attr<std::vector<std::string>>("output_name_mapping");
-    // Add inputs
-    VLOG(4) << "declare inputs";
-    for (auto &input : Inputs("Xs")) {
-      if (param_names_.count(input)) continue;
-      VLOG(4) << "declare input " << input;
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-      auto *var = block.FindVar(input);
-      // TensorRT engine need to create parameters. The parameter's description
-      // should be set in
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
-    }
    inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
-    // Add outputs
-    for (auto &output : output_maps) {
-      engine->DeclareOutput(output);
-    }
-    engine->FreezeNetwork();
  }
 };