// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"

#include <fcntl.h>
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_set>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"

namespace paddle {
namespace inference {
namespace analysis {
namespace {

// If the model runs in mixed precision, we should force every floating-point
// output of a tensorrt_engine op back to float32 dtype.
void OutputProcess(framework::ir::Graph *graph,
                   const std::unordered_set<framework::ir::Node *> &trt_outputs,
                   phi::Backend backend,
                   phi::DataType precision,
                   const std::unordered_set<std::string> &blacklist,
                   const std::unordered_set<std::string> &whitelist) {
  framework::BlockDesc *block_desc{nullptr};
  int suffix = 0;
  std::unordered_map<framework::ir::Node *, framework::ir::Node *>
      var_to_cast_op_map;

  framework::proto::VarType::Type to_type;
  if (precision == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (precision == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else if (precision == phi::DataType::FLOAT32) {
    return;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "mixed_precision does not support dtype %d currently; only "
        "fp16 and bf16 are supported.",
        static_cast<int>(precision)));
  }

  for (auto *op_node : framework::ir::TopologySortOperations(*graph)) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type == "feed") block_desc = op_node->Op()->Block();
    if (op_type != "tensorrt_engine") continue;
    for (auto *var_node : op_node->outputs) {
      if (!trt_outputs.count(var_node)) continue;
      if (!var_node->Var()->Persistable() &&
          IsFloatVar(var_node->Var()->GetDataType()) &&
          var_node->Var()->GetDataType() != framework::proto::VarType::FP32) {
        for (auto *next_op : var_node->outputs) {
          // If next_op supports mixed precision, we need to add a cast op.
          if (OpSupportPrecision(
                  phi::TransToPhiKernelName(next_op->Op()->Type()),
                  backend,
                  precision,
                  blacklist,
                  whitelist)) {
            InsertCastOp(graph,
                         var_node,
                         next_op,
                         framework::proto::VarType::FP32,
                         to_type,
                         block_desc,
                         &suffix,
                         &var_to_cast_op_map);
            var_node->Var()->SetDataType(framework::proto::VarType::FP32);
          }
        }
      }
    }
  }
}

// Determine whether the whole graph is offloaded to TensorRT. If so, we can
// try to enable optimizations such as CUDA Graph.
bool AllNodesLowerToTrtPostProcess(framework::ir::Graph *graph) {
  std::unordered_set<std::string> trt_nodes_set{
      "feed", "fetch", "tensorrt_engine"};
  bool all_nodes_offload_to_trt = true;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp()) {
      if (!trt_nodes_set.count(node->Op()->Type())) {
        all_nodes_offload_to_trt = false;
        break;
      }
    }
  }
  return all_nodes_offload_to_trt;
}
}  // namespace

using framework::ir::Node;

void analysis::TensorRtSubgraphPass::ApplyImpl(
    framework::ir::Graph *graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph);

  auto model_precision =
      static_cast<phi::DataType>(Get<int>("model_precision"));
  if (model_precision == phi::DataType::BFLOAT16) {
    LOG(WARNING)
        << "Paddle-TRT does not support bf16 mixed precision, falling back.";
    return;
  }

  auto enable_int8 = Get<bool>("enable_int8");
  auto use_calib_mode = Get<bool>("use_calib_mode");
  bool use_cuda_graph = Get<bool>("use_cuda_graph");
  bool no_calib_int8 = enable_int8 && !(use_calib_mode);
  auto trt_disabled_ops = Get<std::vector<std::string>>("trt_disabled_ops");
  auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
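  // The teller decides whether an op node can be offloaded to TensorRT: it
  // rejects ops whose type or output variable names appear in
  // trt_disabled_ops, then consults the global OpTeller for converter support.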
  auto teller = [&](const framework::ir::Node *node) {
    if (!node->IsOp() || !node->Op()) return false;
    if (find(trt_disabled_ops.begin(),
             trt_disabled_ops.end(),
             node->Op()->Type()) != trt_disabled_ops.end()) {
      VLOG(3) << node->Op()->Type().c_str()
              << " is disabled by config in TensorRT";
      return false;
    }
    for (const auto &out_var : node->Op()->OutputNames()) {
      for (const auto &var_name : node->Op()->Output(out_var)) {
        if (find(trt_disabled_ops.begin(), trt_disabled_ops.end(), var_name) !=
            trt_disabled_ops.end()) {
          VLOG(3) << node->Op()->Type().c_str()
                  << " is disabled by config in TensorRT";
          return false;
        }
      }
    }
    bool is_ok = tensorrt::OpTeller::Global().Tell(
        node, no_calib_int8, with_dynamic_shape);
    if (!is_ok)
      VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT";
    return is_ok;
  };

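  // Fuse all TensorRT-convertible nodes (as judged by the teller) into
  // subgraphs; each fused subgraph will later be replaced by a single
  // tensorrt_engine op.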
  framework::ir::SubGraphFuser fuser(
      graph,
      teller,
      Get<int>("min_subgraph_size") /*min subgraph size*/,
      "tensorrt_engine");
  fuser();

  std::vector<std::string> graph_param_names =
      ExtractParameters(graph->Nodes());
  // Those parameters already exist in trt, and should not have another copy
  // in fluid.
  std::vector<std::string> repetitive_params;
  std::vector<std::string> engine_names;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
      engine_names.push_back(CreateTensorRTOp(
          node, graph, graph_param_names, &repetitive_params, use_cuda_graph));
    }
  }

  std::unordered_set<const Node *> nodes2remove;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && framework::ir::Agent(node).deleted()) {
      nodes2remove.insert(node);
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
  graph->Set(framework::ir::kRepetitiveParamAttr,
             new std::vector<std::string>(repetitive_params));

  bool all_nodes_offload_to_trt = AllNodesLowerToTrtPostProcess(graph);
  if (all_nodes_offload_to_trt) {
    LOG(INFO) << "The entire graph is offloaded to TensorRT.";
  }
  if (use_cuda_graph && !all_nodes_offload_to_trt) {
    LOG_FIRST_N(WARNING, 1)
        << "You have enabled CudaGraph, but the entire graph is not offloaded "
           "to trt, so falling back to normal mode.";
    use_cuda_graph = false;
  }
  if (use_cuda_graph && all_nodes_offload_to_trt) {
    for (auto &name : engine_names) {
      PADDLE_ENFORCE_EQ(
          paddle::inference::Singleton<
              inference::tensorrt::TRTEngineManager>::Global()
              .Has(name),
          true,
          platform::errors::PreconditionNotMet(
              "TRTEngineManager should have engine %s, but it was not found.",
              name));
      paddle::inference::Singleton<
          inference::tensorrt::TRTEngineManager>::Global()
          .Get(name)
          ->SetAllNodesLowerToTrt(use_cuda_graph);
    }
  }

  // Some ops are only implemented in paddle-trt,
  // but not in paddle, so we should revert them here.
  for (auto *op_node : framework::ir::TopologyVarientSort(
           *graph, static_cast<framework::ir::SortKind>(0))) {
    if (op_node->Op()->Type() == "matrix_multiply") {
      auto origin_type =
          op_node->Op()->GetAttrIfExists<std::string>("original_type");
      LOG(WARNING) << "matrix_multiply can't enter into paddle-trt, "
                   << "we will revert to " << origin_type;
      op_node->Op()->SetType(origin_type);
      op_node->RenameOp(origin_type);
    }
  }
}

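// Build a deterministic hash key for a TRT engine from its input/output names
// and build options, so that built engines can be cached and looked up later.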
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
                              const std::set<std::string> &engine_outputs,
                              const std::string &predictor_id,
                              const std::string &max_batch_size,
                              const std::string &precision,
                              bool use_cuda_graph,
                              const bool for_calibration) {
  std::string engine_hash_key = "";
  for (auto name : engine_inputs) {
    engine_hash_key += name;
    engine_hash_key += "#";
  }
  for (auto name : engine_outputs) {
    engine_hash_key += name;
    engine_hash_key += "#";
  }
  engine_hash_key += predictor_id;
  if (!for_calibration) {
    engine_hash_key += "#";
    engine_hash_key += max_batch_size;
  }
  engine_hash_key += "#";
  engine_hash_key += precision;

  engine_hash_key += "#";
  engine_hash_key += use_cuda_graph;

  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
  VLOG(2) << "TRT engine hash key: " << engine_hash_key;
  VLOG(2) << "TRT engine key: " << engine_key;
  return engine_key;
}

std::string TensorRtSubgraphPass::CreateTensorRTOp(
    framework::ir::Node *node,
    framework::ir::Graph *graph,
    const std::vector<std::string> &graph_params,
    std::vector<std::string> *repetitive_params,
    bool use_cuda_graph) const {
  auto *op_desc = node->Op();
  auto &subgraph = *framework::ir::Agent(node).subgraph();
  PADDLE_ENFORCE_EQ(subgraph.empty(),
                    false,
                    platform::errors::PreconditionNotMet(
                        "The subgraph should not be empty."));

  framework::ProgramDesc *program_desc =
      Get<framework::ProgramDesc *>("program");
  // Add new block for TensorRTEngineOP
  const framework::BlockDesc &main_block =
      program_desc->Block(framework::kRootBlockIndex);
  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);

  // A fake block desc.
  framework::proto::BlockDesc block_proto;
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
  LOG(INFO) << "---  detect a sub-graph with " << subgraph.size() << " nodes";
  for (auto node : subgraph) {
    if (node->NodeType() == Node::Type::kOperation) {
      VLOG(5) << "trt subgraph has op: " << (node->Op()->Type());
    }
  }

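  // Copy every op of the subgraph into both the real sub-block (attached to
  // the tensorrt_engine op) and the fake block used for conversion below.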
  for (auto *node : subgraph) {
    auto *new_block_op = new_block->AppendOp();
    auto *op = block_desc.AppendOp();
    *new_block_op->Proto() = *node->Op()->Proto();
    *op->Proto() = *node->Op()->Proto();
  }

  // Then, we will use the input_names_with_id and output_names_with_id to
  // generate the engine key.
  // So, we use set instead of unordered_set here to make the iteration order
  // deterministic and thus keep the engine key stable.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
  std::vector<std::string> parameters;
  // If we delete the fluid copy of a parameter shared by more than one op,
  // there will be problems, so we filter such parameters out.
  std::vector<std::string> params_not_shared;

  auto *scope = param_scope();
  // The node->inputs contains input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
      parameters.push_back(x->Name());
    }
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
        x->outputs.size() <= 1) {
      params_not_shared.push_back(x->Name());
    }
    // When a TRT engine's input is INT64 or FP64, we need to do some extra
    // work. So we reserve a name for later use when casting INT64 -> INT32 or
    // FP64 -> FP32. We must check whether the scope already has a var with
    // the same name!
    if (x->Var()->GetDataType() == framework::proto::VarType::INT64) {
      LOG(WARNING)
          << "tensorrt_subgraph's input named " << x->Name()
          << " has int64 dtype in the pdmodel description; we will cast it to "
             "int32 dtype to feed it into paddle-trt.";
    } else if (x->Var()->GetDataType() == framework::proto::VarType::FP64) {
      LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name()
                   << " has float64 dtype in the pdmodel description; we will "
                      "cast it to "
                      "float32 dtype to feed it into paddle-trt.";
    }
  }

  // A var may have the same name but not the same id.
  // e.g., var(batch_norm2d_0.w_1) may have ids 10, 13, 25, ... in a graph,
  // so we must collect every var_name + id combination.
  // https://github.com/PaddlePaddle/Paddle/pull/53184
  for (auto *n : graph->Nodes()) {
    if (n->IsVar() && input_names.count(n->Name())) {
      input_names_with_id.insert(n->Name() + std::to_string(n->id()));
    }
  }

  auto model_precision =
      static_cast<phi::DataType>(Get<int>("model_precision"));
  auto mixed_black_list =
      Get<std::unordered_set<std::string>>("mixed_black_list");
  auto mixed_white_list =
      Get<std::unordered_set<std::string>>("mixed_white_list");

  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
  std::map<std::string, int> origin_name_output_rank;
  std::unordered_set<Node *> trt_outputs;
  // record the origin output data type
  std::vector<int> origin_outputs_dtype;
  std::map<std::string, int> map_origin_outputs_dtype;

  // Mark TensorRT output nodes as trt outputs
  auto mark_output = Get<bool>("mark_output");
  auto output_tensor_name =
      Get<std::vector<std::string>>("output_tensor_names");
  auto mark_output_with_id = Get<bool>("mark_output_with_id");

  if (mark_output) {
    VLOG(1) << "begin to mark output ...";
    for (auto node : subgraph) {
      if (node->NodeType() == Node::Type::kOperation) {
        for (auto *x : node->outputs) {
          if (std::count(parameters.begin(), parameters.end(), x->Name()) > 0)
            continue;
          std::string name_with_id = x->Name() + std::to_string(x->id());
          if (((!mark_output_with_id && std::count(output_tensor_name.begin(),
                                                   output_tensor_name.end(),
                                                   x->Name()) > 0) ||
               (mark_output_with_id && std::count(output_tensor_name.begin(),
                                                  output_tensor_name.end(),
                                                  name_with_id) > 0)) &&
              !x->outputs.empty()) {
            VLOG(3) << "output " << x->Name() << " has been marked";
            output_names.insert(x->Name());
            output_names_with_id.insert(name_with_id);
            origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
            trt_outputs.insert(x);
            map_origin_outputs_dtype[x->Name()] =
                static_cast<int>(x->Var()->GetDataType());
          }
        }
      }
    }
  }

  for (auto *x : node->outputs) {
    output_names.insert(x->Name());
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
    origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
    trt_outputs.insert(x);
    map_origin_outputs_dtype[x->Name()] =
        static_cast<int>(x->Var()->GetDataType());
  }

  OutputProcess(graph,
                trt_outputs,
                phi::Backend::GPU,
                model_precision,
                mixed_black_list,
                mixed_white_list);

  std::unordered_map<std::string, std::string> output_name_map;
  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;

  for (framework::ir::Node *node : graph->Nodes()) {
    if (node->IsVar() && node->Var()) {
      graph_var_map[node->Name()] = node;
    }
  }
  auto precision_mode =
      static_cast<phi::DataType>(Get<int>("trt_precision_mode"));
  bool enable_fp16 = false;
  if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true;
  auto enable_int8 = Get<bool>("enable_int8");
  auto use_calib_mode = Get<bool>("use_calib_mode");
  auto &subgraph_nodes = *framework::ir::Agent(node).subgraph();
  auto min_input_shape =
      Get<std::map<std::string, std::vector<int>>>("min_input_shape");
  auto max_input_shape =
      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
  auto optim_input_shape =
      Get<std::map<std::string, std::vector<int>>>("optim_input_shape");

  auto min_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("min_shape_tensor");
  auto max_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("max_shape_tensor");
  auto optim_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("optim_shape_tensor");

  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
  auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
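  // With tuned dynamic shape enabled, the min/max/opt input shapes and shape
  // tensors are deserialized from the shape range info file rather than taken
  // from the pass configuration.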
  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
  int max_batch_size = Get<int>("max_batch_size");
  if (trt_tuned_dynamic_shape) {
    if (!shape_range_info_path.empty()) {
      VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
      inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                           &min_input_shape,
                                           &max_input_shape,
                                           &optim_input_shape,
                                           &min_shape_tensor,
                                           &max_shape_tensor,
                                           &optim_shape_tensor);
    } else {
      shape_range_info_path =
          Get<std::string>("model_opt_cache_dir") + "shape_range_info.pbtxt";
      int probe_fd = open(shape_range_info_path.c_str(), O_RDONLY);
      if (probe_fd != -1) {
        // The file exists; close the probing descriptor before deserializing
        // so that it is not leaked.
        close(probe_fd);
        VLOG(1) << "trt dynamic_shape deserialize from "
                << shape_range_info_path;
        inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                             &min_input_shape,
                                             &max_input_shape,
                                             &optim_input_shape,
                                             &min_shape_tensor,
                                             &max_shape_tensor,
                                             &optim_shape_tensor);
      } else {
        int fd = open(shape_range_info_path.c_str(), O_WRONLY | O_CREAT, 0644);
        close(fd);
      }
    }
  }

  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
  // Why do we do this?
  // During the transition from a fluid OP to a tensorrt OP, we map the input
  // and output Tensor (fluid data structure) of the fluid OP to the
  // corresponding ITensor (trt data structure) through the Tensor name. When
  // we set up an ITensor for a variable, we must ensure that it has not been
  // set before.
  // If a variable in the fluid graph is not only the input of one OP but also
  // the output of another OP, there will be problems. So we have to rename
  // the variables in the subgraph to make sure each one is either an OP's
  // input or an OP's output.
  RenameAndGetOutputs(subgraph_nodes,
                      &block_desc,
                      input_names_with_id,
                      &output_names_with_id,
                      &output_names,
                      &output_name_map,
                      graph_var_map,
                      !enable_int8);

  // When the tensorrt engine op runs at the end of the operation,
  // output_mapping helps us copy the data from the renamed ITensor
  // back to the original Tensor.
  std::vector<std::string> output_mapping;
  std::vector<int> renamed_output_rank;
  for (auto name : output_names) {
    PADDLE_ENFORCE_NE(output_name_map.count(name),
                      0,
                      platform::errors::PreconditionNotMet(
                          "The output_name_map should have %s", name));
    output_mapping.push_back(output_name_map[name]);
    renamed_output_rank.push_back(origin_name_output_rank[name]);
    origin_outputs_dtype.push_back(map_origin_outputs_dtype[name]);

    // When a TRT engine's output is INT64 or FP64, we need to do some extra
    // work. So we reserve a name for later use when casting INT32 -> INT64 or
    // FP32 -> FP64. We must check whether the scope already has a var with
    // the same name!
    if (static_cast<framework::proto::VarType_Type>(
            map_origin_outputs_dtype[name]) ==
        framework::proto::VarType::INT64) {
      LOG(WARNING) << "tensorrt_subgraph's output named " << name
                   << " has int64 dtype in the pdmodel description, but it is "
                      "actually int32 "
                      "dtype after executing this tensorrt_subgraph, so we "
                      "need to cast it back into int64.";
    } else if (static_cast<framework::proto::VarType_Type>(
                   map_origin_outputs_dtype[name]) ==
               framework::proto::VarType::FP64) {
      LOG(WARNING)
          << "tensorrt_subgraph's output named " << name
          << " has float64 dtype in the pdmodel description, but it is "
             "actually float32 "
             "dtype after executing this tensorrt_subgraph, so we "
             "need to cast it back into float64.";
    }
  }
  PADDLE_ENFORCE_EQ(output_mapping.empty(),
                    false,
                    platform::errors::PreconditionNotMet(
                        "The output_mapping should not be empty."));
  PADDLE_ENFORCE_EQ(
      !block_desc.Proto()->vars().empty(),
      true,
      platform::errors::PreconditionNotMet("the block has no var-desc"));

  // Get pass attrs.
  auto use_varseqlen = Get<bool>("use_varseqlen");
  auto with_interleaved = Get<bool>("with_interleaved");
  auto tensorrt_transformer_posid =
      Get<std::string>("tensorrt_transformer_posid");
  auto tensorrt_transformer_maskid =
      Get<std::string>("tensorrt_transformer_maskid");
  auto use_dla = Get<bool>("trt_use_dla");
  auto dla_core = Get<int>("trt_dla_core");
  auto use_inspector = Get<bool>("use_inspector");
  auto disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
  auto context_memory_sharing = Get<bool>("context_memory_sharing");
  auto enable_low_precision_io = Get<bool>("enable_low_precision_io");
  auto workspace_size = Get<int64_t>("workspace_size");
  auto gpu_device_id = Get<int>("gpu_device_id");

  // Set op's attrs.
  op_desc->SetType("tensorrt_engine");
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
  op_desc->SetOutput(
      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
  op_desc->SetBlockAttr("sub_block", new_block);
  op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
  op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype);
  op_desc->SetAttr("max_batch_size", max_batch_size);
  op_desc->SetAttr("workspace_size", workspace_size);
  op_desc->SetAttr("gpu_device_id", gpu_device_id);
  op_desc->SetAttr("output_name_mapping", output_mapping);
  op_desc->SetAttr("origin_output_rank", renamed_output_rank);
  op_desc->SetAttr("parameters", parameters);
  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
  op_desc->SetAttr("use_inspector", use_inspector);
  op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
  op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io);

  if (!trt_tuned_dynamic_shape) {
    std::vector<std::string> dynamic_shape_names;
    std::vector<int> dynamic_shape_lens;
    std::vector<int> min_input_shape_vector;
    std::vector<int> max_input_shape_vector;
    std::vector<int> opt_input_shape_vector;
    for (const auto &it : min_input_shape) {
      dynamic_shape_names.push_back(it.first);
      dynamic_shape_lens.push_back(it.second.size());
      for (const auto &value : it.second) {
        min_input_shape_vector.push_back(value);
      }
    }
    for (const auto &it : max_input_shape) {
      for (const auto &value : it.second) {
        max_input_shape_vector.push_back(value);
      }
    }
    for (const auto &it : optim_input_shape) {
      for (const auto &value : it.second) {
        opt_input_shape_vector.push_back(value);
      }
    }

    op_desc->SetAttr("dynamic_shape_names", dynamic_shape_names);
    op_desc->SetAttr("dynamic_shape_lens", dynamic_shape_lens);
    op_desc->SetAttr("min_input_shape_vector", min_input_shape_vector);
    op_desc->SetAttr("max_input_shape_vector", max_input_shape_vector);
    op_desc->SetAttr("opt_input_shape_vector", opt_input_shape_vector);
  }

  // We record all inputs' shapes in attrs to check if they are consistent
  // with the real inputs' shapes retrieved from the scope when trt runs.
  for (auto *x : node->inputs) {
    if (x->IsVar() && x->Var()) {
      framework::VarDesc *var = x->Var();
      op_desc->SetAttr(var->Name() + "_shape", var->GetShape());
    }
  }

  auto use_static_engine = Get<bool>("use_static_engine");
  op_desc->SetAttr("use_static_engine", use_static_engine);
  if (use_static_engine)
    op_desc->SetAttr("model_opt_cache_dir",
                     Get<std::string>("model_opt_cache_dir"));

  // TODO(NHZlX)
  // There are models with the same structure but different parameters;
  // when running in the 'use_serialize' mode, there is a bug.
  // Serialization is affected by max_batch_size, but calibration is not,
  // so we use separate engine keys for serialization and calibration.
  auto engine_key =
      GenerateEngineKey(input_names_with_id,
                        output_names_with_id,
                        std::to_string(0),
                        std::to_string(max_batch_size),
                        std::to_string(static_cast<int>(precision_mode)),
                        use_cuda_graph,
                        false);
  auto calibration_engine_key =
      GenerateEngineKey(input_names_with_id,
                        output_names_with_id,
                        std::to_string(0),
                        std::to_string(max_batch_size),
                        std::to_string(static_cast<int>(precision_mode)),
                        use_cuda_graph,
                        true);
  auto predictor_id = Get<int>("predictor_id");

  // Get "" when there is no cached calibration table data.
  std::string calibration_data = "";
  if (enable_int8 && use_calib_mode) {
    calibration_data =
        GetTrtCalibTableData(Get<std::string>("model_opt_cache_dir"),
                             calibration_engine_key,
                             enable_int8);
  }
  op_desc->SetAttr("calibration_data", calibration_data);
  op_desc->SetAttr("enable_int8", enable_int8);
  op_desc->SetAttr("enable_fp16", enable_fp16);
  op_desc->SetAttr("use_calib_mode", use_calib_mode);
  op_desc->SetAttr("engine_key", engine_key);
  op_desc->SetAttr("calibration_engine_key", calibration_engine_key);
  op_desc->SetAttr("predictor_id", predictor_id);
  op_desc->SetAttr("use_varseqlen", use_varseqlen);
  op_desc->SetAttr("with_interleaved", with_interleaved);
  op_desc->SetAttr("use_dla", use_dla);
  op_desc->SetAttr("dla_core", dla_core);
  op_desc->SetAttr("disable_trt_plugin_fp16", disable_trt_plugin_fp16);
  op_desc->SetAttr("context_memory_sharing", context_memory_sharing);
  std::string trt_engine_serialized_data;
  op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);
  op_desc->Flush();

  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
  if (enable_int8 && !calibration_data.empty()) {
    calibrator =
        std::make_unique<tensorrt::TRTInt8Calibrator>(calibration_data);
    LOG(INFO) << "RUN Paddle TRT int8 calibration mode...";
  }
  // When in int8 mode and calibration mode, the program just produces the
  // calibration table data.
  bool calibration_mode =
      (enable_int8 && calibration_data.empty() && use_calib_mode);
  if (calibration_mode) {
    // Calibration mode means this pass only generates the int8 calibration
    // table data.
    return calibration_engine_key;
  }

  std::copy(params_not_shared.begin(),
            params_not_shared.end(),
            std::back_inserter(*repetitive_params));

  // Check trt version for dynamic shape input.

  if (!min_input_shape.empty() && TRT_VERSION < 6000) {
    LOG_FIRST_N(WARNING, 1) << "You are using the dynamic size input mode of "
                               "Paddle-TRT, but we found that the version of "
                               "the TensorRT is less than 6.0, so we use the "
                               "static shape mode instead.";
    min_input_shape = {};
    max_input_shape = {};
    optim_input_shape = {};
  }

  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
  const float trt_runtime_version =
      tensorrt::TrtMajorVersion(tensorrt::GetInferLibVersion());
  if (trt_compile_version != trt_runtime_version) {
    LOG_FIRST_N(WARNING, 1)
        << "The Paddle Inference library is compiled with "
        << trt_compile_version << " version TensorRT, "
        << "but the runtime TensorRT you are using is " << trt_runtime_version
        << " version. "
           "This might cause serious compatibility issues. We strongly "
           "recommend using the same TRT version at runtime.";
  }

  std::unordered_set<const Node *> nodes2remove(
      framework::ir::Agent(node).subgraph()->begin(),
      framework::ir::Agent(node).subgraph()->end());
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

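  // Gather all construction parameters and create the TRT engine through the
  // global TRTEngineManager, keyed by engine_key + predictor_id.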
  tensorrt::TensorRTEngine::ConstructionParams params;
  params.max_batch_size = max_batch_size;
  params.max_workspace_size = workspace_size;
  params.calibrator = calibrator.get();
  params.device_id = gpu_device_id;
  params.with_dynamic_shape = with_dynamic_shape;
  params.min_input_shape = min_input_shape;
  params.max_input_shape = max_input_shape;
  params.optim_input_shape = optim_input_shape;
  params.min_shape_tensor = min_shape_tensor;
  params.max_shape_tensor = max_shape_tensor;
  params.optim_shape_tensor = optim_shape_tensor;
  params.disable_trt_plugin_fp16 = disable_trt_plugin_fp16;
  params.precision = precision_mode;
  params.use_varseqlen = use_varseqlen;
  params.use_dla = use_dla;
  params.dla_core = dla_core;
  params.with_interleaved = with_interleaved;
  params.tensorrt_transformer_posid = tensorrt_transformer_posid;
  params.tensorrt_transformer_maskid = tensorrt_transformer_maskid;
  params.context_memory_sharing = context_memory_sharing;
  params.use_inspector = use_inspector;
  params.enable_low_precision_io = enable_low_precision_io;

  tensorrt::TensorRTEngine *trt_engine =
      inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
          .Create(engine_key + std::to_string(predictor_id), params);

  if (use_static_engine) {
    trt_engine_serialized_data = GetTrtEngineSerializedData(
        Get<std::string>("model_opt_cache_dir"), engine_key);
    // we can load the engine info serialized before from the disk.
    if (!trt_engine_serialized_data.empty()) {
      try {
        trt_engine->Deserialize(trt_engine_serialized_data);
        LOG(INFO) << "Load TRT Optimized Info from "
                  << GetTrtEngineSerializedPath(
                         Get<std::string>("model_opt_cache_dir"), engine_key);
        return engine_key + std::to_string(predictor_id);
      } catch (const std::exception &exp) {
        LOG(WARNING)
            << "Fail to load TRT Optimized Info from "
            << GetTrtEngineSerializedPath(
                   Get<std::string>("model_opt_cache_dir"), engine_key)
            << ". Engine deserialization failed: Serialized Engine Version "
               "does not match Current Version, TRT engine will be rebuilt";
      }
    }
  }

  // If with_dynamic_shape is configured but min_input_shape is empty,
  // create the trt engine at runtime instead of in this pass.
  if (with_dynamic_shape && min_input_shape.empty()) {
    return engine_key + std::to_string(predictor_id);
  }

  // The following code will NOT run in the following situations:
  // 1. calibration mode (generating the trt int8 calibration table data)
  // 2. a serialized trt engine has already been loaded.
  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
               "kernel etc). This process may cost a lot of time.";

  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
  std::unordered_set<std::string> parameters_set(parameters.begin(),
                                                 parameters.end());
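  // Convert every op in the fake block into TensorRT layers and build the
  // engine here in the pass.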
  inference::Singleton<inference::tensorrt::OpConverter>::Global()
      .ConvertBlockToTRTEngine(
          &block_desc_temp,
          *scope,
          std::vector<std::string>(input_names.begin(), input_names.end()),
          parameters_set,
          output_mapping,
          trt_engine);

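  // Serialize the freshly built engine into model_opt_cache_dir so that later
  // runs can deserialize it instead of rebuilding.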
  if (use_static_engine) {
    nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
    trt_engine_serialized_data =
        std::string((const char *)serialized_engine_data->data(),
                    serialized_engine_data->size());
    SaveTrtEngineSerializedDataToFile(
        GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
                                   engine_key),
        trt_engine_serialized_data);
    LOG(INFO) << "Save TRT Optimized Info to "
              << GetTrtEngineSerializedPath(
                     Get<std::string>("model_opt_cache_dir"), engine_key);
  }

  return engine_key + std::to_string(predictor_id);
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle

REGISTER_PASS(tensorrt_subgraph_pass,
              paddle::inference::analysis::TensorRtSubgraphPass)
    .RequirePassAttr("max_batch_size")
    .RequirePassAttr("workspace_size")
    .RequirePassAttr("min_subgraph_size");

REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .LE("conv2d", 1)
            .EQ("pool2d", 0)
            .EQ("relu", 0)
            .EQ("softmax", 0)
            .EQ("sigmoid", 0)
            .EQ("hard_swish", 0)
            .LE("depthwise_conv2d", 1)
            .EQ("batch_norm", 0)
            .EQ("concat", 0)
            .EQ("tanh", 0)
            .EQ("pad", 0)
            .LE("elementwise_add", 1)
            .LE("elementwise_mul", 1)
            .EQ("prelu", 0)
            .LE("conv2d_transpose", 2)
            .LE("leaky_relu", 1)
            .EQ("fc", 0)
            .EQ("shuffle_channel", 0)
            .EQ("swish", 0)
            .EQ("silu", 0)
            .EQ("split", 0)
            .LE("instance_norm", 1)
            .EQ("gelu", 0)
            .EQ("layer_norm", 0)
            .EQ("scale", 0)
            .LE("matmul", 1));