// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"

#include <fcntl.h>

#include <map>
#include <memory>
#include <set>
#include <string>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"

namespace paddle {
namespace inference {
namespace analysis {
namespace {

// If the model runs in mixed precision, every float output of a
// tensorrt_engine op must be set back to float32 dtype.
void OutputProcess(framework::ir::Graph *graph,
                   const std::unordered_set<framework::ir::Node *> &trt_outputs,
                   phi::Backend backend,
                   phi::DataType precision,
                   const std::unordered_set<std::string> &blacklist,
                   const std::unordered_set<std::string> &whitelist) {
  framework::BlockDesc *block_desc{nullptr};
  int suffix = 0;
  std::unordered_map<framework::ir::Node *, framework::ir::Node *>
      var_to_cast_op_map;

  framework::proto::VarType::Type to_type;
  if (precision == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (precision == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else if (precision == phi::DataType::FLOAT32) {
    return;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "mixed_precision does not currently support dtype %d; only fp16 and "
        "bf16 are supported.",
        static_cast<int>(precision)));
  }

  for (auto *op_node : framework::ir::TopologySortOperations(*graph)) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type == "feed") block_desc = op_node->Op()->Block();
    if (op_type != "tensorrt_engine") continue;
    for (auto *var_node : op_node->outputs) {
      if (!trt_outputs.count(var_node)) continue;
      if (!var_node->Var()->Persistable() &&
          IsFloatVar(var_node->Var()->GetDataType()) &&
          var_node->Var()->GetDataType() != framework::proto::VarType::FP32) {
        for (auto *next_op : var_node->outputs) {
          // If the next op supports mixed precision, insert a cast op.
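          // Note: the engine output var itself stays float32 (see the
          // SetDataType call below); the inserted cast converts that float32
          // value to the low-precision dtype the downstream op expects.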
          if (OpSupportPrecision(
                  phi::TransToPhiKernelName(next_op->Op()->Type()),
                  backend,
                  precision,
                  blacklist,
                  whitelist)) {
            InsertCastOp(graph,
                         var_node,
                         next_op,
                         framework::proto::VarType::FP32,
                         to_type,
                         block_desc,
                         &suffix,
                         &var_to_cast_op_map);
            var_node->Var()->SetDataType(framework::proto::VarType::FP32);
          }
        }
      }
    }
  }
}

// Determine whether the whole graph is offloaded to TensorRT. If so, we can
// try to enable optimizations such as CUDA Graph.
bool AllNodesLowerToTrtPostProcess(framework::ir::Graph *graph) {
  std::unordered_set<std::string> trt_nodes_set{
      "feed", "fetch", "tensorrt_engine"};
  bool all_nodes_offload_to_trt = true;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp()) {
      if (!trt_nodes_set.count(node->Op()->Type())) {
        all_nodes_offload_to_trt = false;
        break;
      }
    }
  }
  return all_nodes_offload_to_trt;
}
}  // namespace

using framework::ir::Node;

void analysis::TensorRtSubgraphPass::ApplyImpl(
    framework::ir::Graph *graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph);

  auto model_precision =
      static_cast<phi::DataType>(Get<int>("model_precision"));
  if (model_precision == phi::DataType::BFLOAT16) {
    LOG(WARNING)
        << "Paddle-TRT does not support bf16 mixed precision; falling back.";
    return;
  }

  auto enable_int8 = Get<bool>("enable_int8");
  auto use_calib_mode = Get<bool>("use_calib_mode");
  bool use_cuda_graph = Get<bool>("use_cuda_graph");
  bool no_calib_int8 = enable_int8 && !(use_calib_mode);
  auto trt_disabled_ops = Get<std::vector<std::string>>("trt_disabled_ops");
  auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
  auto teller = [&](const framework::ir::Node *node) {
    if (!node->IsOp() || !node->Op()) return false;
    if (std::find(trt_disabled_ops.begin(),
                  trt_disabled_ops.end(),
                  node->Op()->Type()) != trt_disabled_ops.end()) {
      VLOG(3) << node->Op()->Type().c_str()
              << " is disabled by config in TensorRT";
      return false;
    }
    for (const auto &out_var : node->Op()->OutputNames()) {
      for (const auto &var_name : node->Op()->Output(out_var)) {
        if (std::find(trt_disabled_ops.begin(),
                      trt_disabled_ops.end(),
                      var_name) != trt_disabled_ops.end()) {
          VLOG(3) << node->Op()->Type().c_str()
                  << " is disabled by config in TensorRT";
          return false;
        }
      }
    }
    bool is_ok = tensorrt::OpTeller::Global().Tell(
        node, no_calib_int8, with_dynamic_shape);
    if (!is_ok) {
      VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT";
    }
    return is_ok;
  };

  framework::ir::SubGraphFuser fuser(
      graph,
      teller,
      Get<int>("min_subgraph_size") /*min subgraph size*/,
      "tensorrt_engine");
  fuser();

  std::vector<std::string> graph_param_names =
      ExtractParameters(graph->Nodes());
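  // Each fused subgraph node is rewritten into a tensorrt_engine op by
  // CreateTensorRTOp below; the returned engine keys are collected so that
  // the corresponding engines can be looked up in TRTEngineManager afterwards
  // (e.g. to flag them for CUDA Graph).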
  // These parameters already exist in TRT and should not keep another copy
  // in fluid.
  std::vector<std::string> repetitive_params;
  std::vector<std::string> engine_names;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
      engine_names.push_back(CreateTensorRTOp(
          node, graph, graph_param_names, &repetitive_params, use_cuda_graph));
    }
  }

  std::unordered_set<const Node *> nodes2remove;
  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && framework::ir::Agent(node).deleted()) {
      nodes2remove.insert(node);
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
  graph->Set(framework::ir::kRepetitiveParamAttr,
             new std::vector<std::string>(repetitive_params));

  bool all_nodes_offload_to_trt = AllNodesLowerToTrtPostProcess(graph);
  if (all_nodes_offload_to_trt) {
    LOG(INFO) << "The entire graph is offloaded to TensorRT.";
  }
  if (use_cuda_graph && !all_nodes_offload_to_trt) {
    LOG_FIRST_N(WARNING, 1)
        << "You have enabled CUDA Graph, but the entire graph is not "
           "offloaded to TRT; returning to normal mode.";
    use_cuda_graph = false;
  }
  if (use_cuda_graph && all_nodes_offload_to_trt) {
    for (auto &name : engine_names) {
      PADDLE_ENFORCE_EQ(
          paddle::inference::Singleton<
              inference::tensorrt::TRTEngineManager>::Global()
              .Has(name),
          true,
          platform::errors::PreconditionNotMet(
              "TRTEngineManager should have engine %s, but it was not found.",
              name));
      paddle::inference::Singleton<
          inference::tensorrt::TRTEngineManager>::Global()
          .Get(name)
          ->SetAllNodesLowerToTrt(use_cuda_graph);
    }
  }

  // Some ops are implemented only in Paddle-TRT, not in Paddle, so we should
  // revert them here.
  for (auto *op_node : framework::ir::TopologyVarientSort(
           *graph, static_cast<framework::ir::SortKind>(0))) {
    if (op_node->Op()->Type() == "matrix_multiply") {
      auto origin_type =
          op_node->Op()->GetAttrIfExists<std::string>("original_type");
      LOG(WARNING) << "matrix_multiply can't enter into Paddle-TRT, "
                   << "so we revert it to " << origin_type;
      op_node->Op()->SetType(origin_type);
      op_node->RenameOp(origin_type);
    }
  }
}

std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
                              const std::set<std::string> &engine_outputs,
                              const std::string &predictor_id,
                              const std::string &max_batch_size,
                              const std::string &precision,
                              bool use_cuda_graph,
                              const bool for_calibration) {
  std::string engine_hash_key = "";
  for (auto name : engine_inputs) {
    engine_hash_key += name;
    engine_hash_key += "#";
  }
  for (auto name : engine_outputs) {
    engine_hash_key += name;
    engine_hash_key += "#";
  }
  engine_hash_key += predictor_id;
  if (!for_calibration) {
    engine_hash_key += "#";
    engine_hash_key += max_batch_size;
  }
  engine_hash_key += "#";
  engine_hash_key += precision;
  engine_hash_key += "#";
  engine_hash_key += use_cuda_graph;
  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
  VLOG(2) << "TRT engine hash key: " << engine_hash_key;
  VLOG(2) << "TRT engine key: " << engine_key;
  return engine_key;
}

std::string TensorRtSubgraphPass::CreateTensorRTOp(
    framework::ir::Node *node,
    framework::ir::Graph *graph,
    const std::vector<std::string> &graph_params,
    std::vector<std::string> *repetitive_params,
    bool use_cuda_graph) const {
  auto *op_desc = node->Op();
  auto &subgraph = *framework::ir::Agent(node).subgraph();
  PADDLE_ENFORCE_EQ(subgraph.empty(),
                    false,
                    platform::errors::PreconditionNotMet(
                        "The subgraph should not be empty."));

  framework::ProgramDesc *program_desc =
      Get<framework::ProgramDesc *>("program");
  // Add a new block for the TensorRTEngineOp.
  const framework::BlockDesc &main_block =
      program_desc->Block(framework::kRootBlockIndex);
  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
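  // Two block descs are kept on purpose: new_block lives in the program and
  // is attached to the op via the "sub_block" attribute, while the fake
  // block_desc below exists only to be serialized into the "subgraph" string
  // attribute consumed by the TRT engine op.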
  // A fake block desc.
  framework::proto::BlockDesc block_proto;
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
  LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes";
  for (auto node : subgraph) {
    if (node->NodeType() == Node::Type::kOperation) {
      VLOG(5) << "trt subgraph has op: " << (node->Op()->Type());
    }
  }

  for (auto *node : subgraph) {
    auto *new_block_op = new_block->AppendOp();
    auto *op = block_desc.AppendOp();
    *new_block_op->Proto() = *node->Op()->Proto();
    *op->Proto() = *node->Op()->Proto();
  }

  // Later, input_names_with_id and output_names_with_id are used to generate
  // the engine key, so we use std::set instead of std::unordered_set here to
  // keep the engine key deterministic.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
  std::vector<std::string> parameters;
  // If we delete the fluid copy of parameters shared by more than one op,
  // there will be a problem, so we filter shared parameters out.
  std::vector<std::string> params_not_shared;

  auto *scope = param_scope();
  // node->inputs contains input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
      parameters.push_back(x->Name());
    }
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
        x->outputs.size() <= 1) {
      params_not_shared.push_back(x->Name());
    }
    // When a TRT engine input is INT64 or FP64, we need to do some extra
    // work, so we reserve a name for later use when casting INT64 -> INT32
    // or FP64 -> FP32. We must check whether the scope already has a var
    // with the same name!
    if (x->Var()->GetDataType() == framework::proto::VarType::INT64) {
      LOG(WARNING) << "tensorrt_subgraph input named " << x->Name()
                   << " has int64 dtype in the pdmodel description; it will "
                      "be cast to int32 before being fed into Paddle-TRT.";
    } else if (x->Var()->GetDataType() == framework::proto::VarType::FP64) {
      LOG(WARNING) << "tensorrt_subgraph input named " << x->Name()
                   << " has float64 dtype in the pdmodel description; it will "
                      "be cast to float32 before being fed into Paddle-TRT.";
    }
  }

  // A var may have the same name but not the same id.
  // e.g., var(batch_norm2d_0.w_1) may have ids 10, 13, 25, ... in a graph,
  // so we must find all occurrences of var_name + id.
  // https://github.com/PaddlePaddle/Paddle/pull/53184
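  // Collecting every (name + id) occurrence keeps the engine key, which is
  // built from input_names_with_id below, stable regardless of which of the
  // same-named var nodes happens to feed this engine op.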
  for (auto *n : graph->Nodes()) {
    if (n->IsVar() && input_names.count(n->Name())) {
      input_names_with_id.insert(n->Name() + std::to_string(n->id()));
    }
  }

  auto model_precision =
      static_cast<phi::DataType>(Get<int>("model_precision"));
  auto mixed_black_list =
      Get<std::unordered_set<std::string>>("mixed_black_list");
  auto mixed_white_list =
      Get<std::unordered_set<std::string>>("mixed_white_list");

  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
  std::map<std::string, int> origin_name_output_rank;
  std::unordered_set<Node *> trt_outputs;
  // Record the origin output data types.
  std::vector<int> origin_outputs_dtype;
  std::map<std::string, int> map_origin_outputs_dtype;

  // Mark TensorRT output nodes as trt outputs.
  auto mark_output = Get<bool>("mark_output");
  auto output_tensor_name =
      Get<std::vector<std::string>>("output_tensor_names");
  auto mark_output_with_id = Get<bool>("mark_output_with_id");

  if (mark_output) {
    VLOG(1) << "begin to mark output ...";
    for (auto node : subgraph) {
      if (node->NodeType() == Node::Type::kOperation) {
        for (auto *x : node->outputs) {
          if (std::count(parameters.begin(), parameters.end(), x->Name()) > 0)
            continue;
          std::string name_with_id = x->Name() + std::to_string(x->id());
          if (((!mark_output_with_id && std::count(output_tensor_name.begin(),
                                                   output_tensor_name.end(),
                                                   x->Name()) > 0) ||
               (mark_output_with_id && std::count(output_tensor_name.begin(),
                                                  output_tensor_name.end(),
                                                  name_with_id) > 0)) &&
              !x->outputs.empty()) {
            VLOG(3) << "output " << x->Name() << " has been marked";
            output_names.insert(x->Name());
            output_names_with_id.insert(name_with_id);
            origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
            trt_outputs.insert(x);
            map_origin_outputs_dtype[x->Name()] =
                static_cast<int>(x->Var()->GetDataType());
          }
        }
      }
    }
  }

  for (auto *x : node->outputs) {
    output_names.insert(x->Name());
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
    origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
    trt_outputs.insert(x);
    map_origin_outputs_dtype[x->Name()] =
        static_cast<int>(x->Var()->GetDataType());
  }

  OutputProcess(graph,
                trt_outputs,
                phi::Backend::GPU,
                model_precision,
                mixed_black_list,
                mixed_white_list);

  std::unordered_map<std::string, std::string> output_name_map;
  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;

  for (framework::ir::Node *node : graph->Nodes()) {
    if (node->IsVar() && node->Var()) {
      graph_var_map[node->Name()] = node;
    }
  }

  auto precision_mode =
      static_cast<phi::DataType>(Get<int>("trt_precision_mode"));
  bool enable_fp16 = false;
  if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true;
  auto enable_int8 = Get<bool>("enable_int8");
  auto use_calib_mode = Get<bool>("use_calib_mode");
  auto &subgraph_nodes = *framework::ir::Agent(node).subgraph();
  auto min_input_shape =
      Get<std::map<std::string, std::vector<int>>>("min_input_shape");
  auto max_input_shape =
      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
  auto optim_input_shape =
      Get<std::map<std::string, std::vector<int>>>("optim_input_shape");
  auto min_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("min_shape_tensor");
  auto max_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("max_shape_tensor");
  auto optim_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("optim_shape_tensor");

  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
  auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
  int max_batch_size = Get<int>("max_batch_size");
  if (trt_tuned_dynamic_shape) {
    if (!shape_range_info_path.empty()) {
      VLOG(1) << "trt dynamic_shape deserialize from "
              << shape_range_info_path;
      inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                           &min_input_shape,
                                           &max_input_shape,
                                           &optim_input_shape,
                                           &min_shape_tensor,
                                           &max_shape_tensor,
                                           &optim_shape_tensor);
    } else {
      shape_range_info_path =
          Get<std::string>("model_opt_cache_dir") + "shape_range_info.pbtxt";
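      // Fall back to the shape-range file in the model optimization cache
      // directory: deserialize it if it already exists, otherwise create an
      // empty placeholder file that later tuning runs can fill in.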
"shape_range_info.pbtxt"; if (open(shape_range_info_path.c_str(), O_RDONLY) != -1) { VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path; inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, &optim_input_shape, &min_shape_tensor, &max_shape_tensor, &optim_shape_tensor); } else { int fd = open(shape_range_info_path.c_str(), O_WRONLY | O_CREAT, 0644); close(fd); } } } // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. // Why we do this? // During the transition from fluid OP to tensorrt OP, we map // the input and output Tensor(fluid data structure) of fluid OP // to the corresponding ITensor (trt data structure) through the // Tensor name. When we set up ITensor for an variable, we must // ensure that it has not been set before. // If there is variable in the fluid graph, which is not only the // input of a OP, but also the output of a Op, there will be problems. // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, &output_names_with_id, &output_names, &output_name_map, graph_var_map, !enable_int8); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor // to Tensor. std::vector output_mapping; std::vector renamed_output_rank; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); renamed_output_rank.push_back(origin_name_output_rank[name]); origin_outputs_dtype.push_back(map_origin_outputs_dtype[name]); // When TRT Engine's output is INT64 or FP64, we need do some extra work. // So we reserved a name for later use when casting INT32 -> INT64 or FP32 // -> FP64. We must check whether scope has had the same name var! if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::INT64) { LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having int64 dtype in pdmodel description, but in fact " "it is int32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into int64."; } else if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::FP64) { LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having float64 dtype in pdmodel description, but in fact " "it is float32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into float64."; } } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, platform::errors::PreconditionNotMet( "The output_mapping should not be empty.")); PADDLE_ENFORCE_EQ( !block_desc.Proto()->vars().empty(), true, platform::errors::PreconditionNotMet("the block has no var-desc")); // Get pass attrs. 
  // Get pass attrs.
  auto use_varseqlen = Get<bool>("use_varseqlen");
  auto with_interleaved = Get<bool>("with_interleaved");
  auto tensorrt_transformer_posid =
      Get<std::string>("tensorrt_transformer_posid");
  auto tensorrt_transformer_maskid =
      Get<std::string>("tensorrt_transformer_maskid");
  auto use_dla = Get<bool>("trt_use_dla");
  auto dla_core = Get<int>("trt_dla_core");
  auto use_inspector = Get<bool>("use_inspector");
  auto disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
  auto context_memory_sharing = Get<bool>("context_memory_sharing");
  auto enable_low_precision_io = Get<bool>("enable_low_precision_io");
  auto workspace_size = Get<int64_t>("workspace_size");
  auto gpu_device_id = Get<int>("gpu_device_id");

  // Set the op's attrs.
  op_desc->SetType("tensorrt_engine");
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
  op_desc->SetOutput(
      "Ys",
      std::vector<std::string>(output_names.begin(), output_names.end()));
  op_desc->SetBlockAttr("sub_block", new_block);
  op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
  op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype);
  op_desc->SetAttr("max_batch_size", max_batch_size);
  op_desc->SetAttr("workspace_size", workspace_size);
  op_desc->SetAttr("gpu_device_id", gpu_device_id);
  op_desc->SetAttr("output_name_mapping", output_mapping);
  op_desc->SetAttr("origin_output_rank", renamed_output_rank);
  op_desc->SetAttr("parameters", parameters);
  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
  op_desc->SetAttr("use_inspector", use_inspector);
  op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
  op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io);

  if (!trt_tuned_dynamic_shape) {
    std::vector<std::string> dynamic_shape_names;
    std::vector<int> dynamic_shape_lens;
    std::vector<int> min_input_shape_vector;
    std::vector<int> max_input_shape_vector;
    std::vector<int> opt_input_shape_vector;
    for (const auto &it : min_input_shape) {
      dynamic_shape_names.push_back(it.first);
      dynamic_shape_lens.push_back(it.second.size());
      for (const auto &value : it.second) {
        min_input_shape_vector.push_back(value);
      }
    }
    for (const auto &it : max_input_shape) {
      for (const auto &value : it.second) {
        max_input_shape_vector.push_back(value);
      }
    }
    for (const auto &it : optim_input_shape) {
      for (const auto &value : it.second) {
        opt_input_shape_vector.push_back(value);
      }
    }

    op_desc->SetAttr("dynamic_shape_names", dynamic_shape_names);
    op_desc->SetAttr("dynamic_shape_lens", dynamic_shape_lens);
    op_desc->SetAttr("min_input_shape_vector", min_input_shape_vector);
    op_desc->SetAttr("max_input_shape_vector", max_input_shape_vector);
    op_desc->SetAttr("opt_input_shape_vector", opt_input_shape_vector);
  }

  // We record all inputs' shapes in attrs to check whether they are
  // consistent with the real input shapes retrieved from the scope when TRT
  // runs.
  for (auto *x : node->inputs) {
    if (x->IsVar() && x->Var()) {
      framework::VarDesc *var = x->Var();
      op_desc->SetAttr(var->Name() + "_shape", var->GetShape());
    }
  }

  auto use_static_engine = Get<bool>("use_static_engine");
  op_desc->SetAttr("use_static_engine", use_static_engine);
  if (use_static_engine)
    op_desc->SetAttr("model_opt_cache_dir",
                     Get<std::string>("model_opt_cache_dir"));

  // TODO(NHZlX)
  // There are models with the same structure but different parameters; when
  // running in 'use_serialize' mode there is a bug. Serialization is
  // affected by max_batch_size, but calibration is not, so we use separate
  // engine keys for serialization and calibration.
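  // Hence the two keys below: the engine key hashes max_batch_size in, while
  // the calibration key (for_calibration = true) deliberately leaves it out;
  // see GenerateEngineKey above.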
  auto engine_key =
      GenerateEngineKey(input_names_with_id,
                        output_names_with_id,
                        std::to_string(0),
                        std::to_string(max_batch_size),
                        std::to_string(static_cast<int>(precision_mode)),
                        use_cuda_graph,
                        false);
  auto calibration_engine_key =
      GenerateEngineKey(input_names_with_id,
                        output_names_with_id,
                        std::to_string(0),
                        std::to_string(max_batch_size),
                        std::to_string(static_cast<int>(precision_mode)),
                        use_cuda_graph,
                        true);
  auto predictor_id = Get<int>("predictor_id");

  // Get "" when there is no cached calibration table data.
  std::string calibration_data = "";
  if (enable_int8 && use_calib_mode) {
    calibration_data =
        GetTrtCalibTableData(Get<std::string>("model_opt_cache_dir"),
                             calibration_engine_key,
                             enable_int8);
  }
  op_desc->SetAttr("calibration_data", calibration_data);
  op_desc->SetAttr("enable_int8", enable_int8);
  op_desc->SetAttr("enable_fp16", enable_fp16);
  op_desc->SetAttr("use_calib_mode", use_calib_mode);
  op_desc->SetAttr("engine_key", engine_key);
  op_desc->SetAttr("calibration_engine_key", calibration_engine_key);
  op_desc->SetAttr("predictor_id", predictor_id);
  op_desc->SetAttr("use_varseqlen", use_varseqlen);
  op_desc->SetAttr("with_interleaved", with_interleaved);
  op_desc->SetAttr("use_dla", use_dla);
  op_desc->SetAttr("dla_core", dla_core);
  op_desc->SetAttr("disable_trt_plugin_fp16", disable_trt_plugin_fp16);
  op_desc->SetAttr("context_memory_sharing", context_memory_sharing);
  std::string trt_engine_serialized_data;
  op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);
  op_desc->Flush();

  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
  if (enable_int8 && !calibration_data.empty()) {
    calibrator =
        std::make_unique<tensorrt::TRTInt8Calibrator>(calibration_data);
    LOG(INFO) << "RUN Paddle TRT int8 calibration mode...";
  }
  // In int8 mode with calibration, the program only produces the calibration
  // table data.
  bool calibration_mode =
      (enable_int8 && calibration_data.empty() && use_calib_mode);
  if (calibration_mode) {
    // Calibration mode means we generate the int8 calibration table data.
    return calibration_engine_key;
  }

  std::copy(params_not_shared.begin(),
            params_not_shared.end(),
            std::back_inserter(*repetitive_params));

  // Check the TRT version for dynamic-shape input.
  if (!min_input_shape.empty() && TRT_VERSION < 6000) {
    LOG_FIRST_N(WARNING, 1)
        << "You are using the dynamic-shape input mode of Paddle-TRT, but we "
           "found that the TensorRT version is less than 6.0, so the static "
           "shape mode is used instead.";
    min_input_shape = {};
    max_input_shape = {};
    optim_input_shape = {};
  }

  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
  const float trt_runtime_version =
      tensorrt::TrtMajorVersion(tensorrt::GetInferLibVersion());
  if (trt_compile_version != trt_runtime_version) {
    LOG_FIRST_N(WARNING, 1)
        << "The Paddle Inference library is compiled against TensorRT "
        << trt_compile_version
        << ", but the TensorRT runtime you are using is version "
        << trt_runtime_version
        << ". This might cause serious compatibility issues. We strongly "
           "recommend using the same TRT version at runtime.";
  }
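  // The fused subgraph nodes are no longer needed once the tensorrt_engine
  // op describes them, so remove them from the graph before building the
  // engine.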
  std::unordered_set<const Node *> nodes2remove(
      framework::ir::Agent(node).subgraph()->begin(),
      framework::ir::Agent(node).subgraph()->end());
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

  tensorrt::TensorRTEngine::ConstructionParams params;
  params.max_batch_size = max_batch_size;
  params.max_workspace_size = workspace_size;
  params.calibrator = calibrator.get();
  params.device_id = gpu_device_id;
  params.with_dynamic_shape = with_dynamic_shape;
  params.min_input_shape = min_input_shape;
  params.max_input_shape = max_input_shape;
  params.optim_input_shape = optim_input_shape;
  params.min_shape_tensor = min_shape_tensor;
  params.max_shape_tensor = max_shape_tensor;
  params.optim_shape_tensor = optim_shape_tensor;
  params.disable_trt_plugin_fp16 = disable_trt_plugin_fp16;
  params.precision = precision_mode;
  params.use_varseqlen = use_varseqlen;
  params.use_dla = use_dla;
  params.dla_core = dla_core;
  params.with_interleaved = with_interleaved;
  params.tensorrt_transformer_posid = tensorrt_transformer_posid;
  params.tensorrt_transformer_maskid = tensorrt_transformer_maskid;
  params.context_memory_sharing = context_memory_sharing;
  params.use_inspector = use_inspector;
  params.enable_low_precision_io = enable_low_precision_io;

  tensorrt::TensorRTEngine *trt_engine =
      inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
          .Create(engine_key + std::to_string(predictor_id), params);

  if (use_static_engine) {
    trt_engine_serialized_data = GetTrtEngineSerializedData(
        Get<std::string>("model_opt_cache_dir"), engine_key);
    // We can load engine info serialized earlier from disk.
    if (!trt_engine_serialized_data.empty()) {
      try {
        trt_engine->Deserialize(trt_engine_serialized_data);
        LOG(INFO) << "Load TRT Optimized Info from "
                  << GetTrtEngineSerializedPath(
                         Get<std::string>("model_opt_cache_dir"), engine_key);
        return engine_key + std::to_string(predictor_id);
      } catch (const std::exception &exp) {
        LOG(WARNING)
            << "Failed to load TRT Optimized Info from "
            << GetTrtEngineSerializedPath(
                   Get<std::string>("model_opt_cache_dir"), engine_key)
            << ". Engine deserialization failed: the serialized engine "
               "version does not match the current version, so the TRT "
               "engine will be rebuilt.";
      }
    }
  }

  // If with_dynamic_shape is configured but min_input_shape is empty, create
  // the TRT engine at runtime instead of in this pass.
  if (with_dynamic_shape && min_input_shape.empty()) {
    return engine_key + std::to_string(predictor_id);
  }

  // The following code will NOT run in these situations:
  // 1. calibration mode (generating the TRT int8 calibration table data);
  // 2. a serialized TRT engine has already been loaded.
  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
               "kernel etc). This process may take a lot of time.";
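  // ConvertBlockToTRTEngine walks the ops recorded in block_desc_temp,
  // lowers each one to TensorRT layers through the registered converters,
  // and reads weights from *scope; parameters_set tells it which inputs are
  // weights rather than runtime inputs.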
  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
  std::unordered_set<std::string> parameters_set(parameters.begin(),
                                                 parameters.end());
  inference::Singleton<inference::tensorrt::OpConverter>::Global()
      .ConvertBlockToTRTEngine(
          &block_desc_temp,
          *scope,
          std::vector<std::string>(input_names.begin(), input_names.end()),
          parameters_set,
          output_mapping,
          trt_engine);

  if (use_static_engine) {
    nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
    trt_engine_serialized_data =
        std::string(static_cast<const char *>(serialized_engine_data->data()),
                    serialized_engine_data->size());
    SaveTrtEngineSerializedDataToFile(
        GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
                                   engine_key),
        trt_engine_serialized_data);
    LOG(INFO) << "Save TRT Optimized Info to "
              << GetTrtEngineSerializedPath(
                     Get<std::string>("model_opt_cache_dir"), engine_key);
  }

  return engine_key + std::to_string(predictor_id);
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle

REGISTER_PASS(tensorrt_subgraph_pass,
              paddle::inference::analysis::TensorRtSubgraphPass)
    .RequirePassAttr("max_batch_size")
    .RequirePassAttr("workspace_size")
    .RequirePassAttr("min_subgraph_size");

REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .LE("conv2d", 1)
            .EQ("pool2d", 0)
            .EQ("relu", 0)
            .EQ("softmax", 0)
            .EQ("sigmoid", 0)
            .EQ("hard_swish", 0)
            .LE("depthwise_conv2d", 1)
            .EQ("batch_norm", 0)
            .EQ("concat", 0)
            .EQ("tanh", 0)
            .EQ("pad", 0)
            .LE("elementwise_add", 1)
            .LE("elementwise_mul", 1)
            .EQ("prelu", 0)
            .LE("conv2d_transpose", 2)
            .LE("leaky_relu", 1)
            .EQ("fc", 0)
            .EQ("shuffle_channel", 0)
            .EQ("swish", 0)
            .EQ("silu", 0)
            .EQ("split", 0)
            .LE("instance_norm", 1)
            .EQ("gelu", 0)
            .EQ("layer_norm", 0)
            .EQ("scale", 0)
            .LE("matmul", 1));