diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
index 6139ccb31378874f74fafcc4822993d89b218111..21d2a602001ad834943ac67eb4c81bd039f15b6c 100644
--- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -249,7 +249,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
     subgraphes_[i] = graph->GetSubGraph(i);
     all_op_nodes_[i] = TopologySortOperations(*subgraphes_[i]);
     VLOG(4) << "subgraph " << i << " has " << all_op_nodes_[i].size()
-            << "op nodes";
+            << " op nodes";
     for (auto* var_node : subgraphes_[i]->Nodes()) {
       if (!var_node->IsVar()) continue;
 
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 8492db828a4edb96dffb94061b7fadafff98b696..2289543dd44f5b71c36434bd590c7951ae397157 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -64,10 +64,6 @@ void NaiveExecutor::Run() {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
-                                platform::NvtxRangeColor::Green);
-#endif
 
     for (auto &func : input_hookfuncs_) {
       func(op.get(), scope_);
@@ -77,7 +73,14 @@
       op->SetOutputHooks(output_hookfuncs_);
     }
 
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
+                                platform::NvtxRangeColor::Green);
+#endif
     op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePop();
+#endif
 
     // Update the shared_holder so that only records the max one.
     if (reuse_cache_.count(op.get())) {
@@ -105,9 +108,6 @@
       }
     }
 
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePop();
-#endif
     for (auto &func : output_hookfuncs_) {
       func(op.get(), scope_);
     }
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 2b827f87ada821623f9b1f0f348bec1aad6a921e..e5224b9fa5b415aaf3c1382de3ceafc8ee457f15 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,8 +14,10 @@
 // limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" + #include #include +#include #include #include @@ -32,6 +34,7 @@ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -124,11 +127,6 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); - static std::once_flag trt_plugin_registered; - std::call_once(trt_plugin_registered, []() { - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); - }); - auto model_precision = static_cast(Get("model_precision")); if (model_precision == phi::DataType::BFLOAT16) { @@ -291,7 +289,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // Add new block for TensorRTEngineOP const framework::BlockDesc &main_block = program_desc->Block(framework::kRootBlockIndex); - // const framework::BlockDesc& main_block = program_desc->Block(0); framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); // A fake block desc. @@ -319,9 +316,9 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // is unique. std::set input_names; std::set input_names_with_id; - std::vector params; - // if we delete fluid copy of params shared by more than 1 ops, there will be - // problem, so we filter them out. + std::vector parameters; + // if we delete fluid copy of parameters shared by more than 1 ops, there will + // be problem, so we filter them out. std::vector params_not_shared; auto *scope = param_scope(); @@ -330,7 +327,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { - params.push_back(x->Name()); + parameters.push_back(x->Name()); } if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 && x->outputs.size() <= 1) { @@ -340,33 +337,15 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // So we reserved a name for later use when casting INT64 -> INT32 or // FP64->FP32. We must check whether scope has had the same name var! 
if (x->Var()->GetDataType() == framework::proto::VarType::INT64) { - std::string tmp_name = x->Name() + "_cast_to_INT32"; LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name() << " having int64 dtype in pdmodel description, we will cast them to " "int32 dtype to feed them into paddle-trt."; - /* - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", - tmp_name)); - */ - scope->Var(tmp_name); } else if (x->Var()->GetDataType() == framework::proto::VarType::FP64) { - std::string tmp_name = x->Name() + "_cast_to_FP32"; LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name() << " having float64 dtype in pdmodel description, we will " "cast them to " "float32 dtype to feed them into paddle-trt."; - /* - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", - tmp_name)); - */ - scope->Var(tmp_name); } } @@ -412,10 +391,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } - auto precision_mode = Get("trt_precision_mode"); + auto precision_mode = + static_cast(Get("trt_precision_mode")); bool enable_fp16 = false; - if (precision_mode == static_cast(phi::DataType::FLOAT16)) - enable_fp16 = true; + if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true; auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *framework::ir::Agent(node).subgraph(); @@ -423,14 +402,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( Get>>("min_input_shape"); auto max_input_shape = Get>>("max_input_shape"); - auto opt_input_shape = + auto optim_input_shape = Get>>("optim_input_shape"); auto min_shape_tensor = Get>>("min_shape_tensor"); auto max_shape_tensor = Get>>("max_shape_tensor"); - auto opt_shape_tensor = + auto optim_shape_tensor = Get>>("optim_shape_tensor"); auto allow_build_at_runtime = Get("trt_allow_build_at_runtime"); @@ -444,10 +423,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, - &opt_input_shape, + &optim_input_shape, &min_shape_tensor, &max_shape_tensor, - &opt_shape_tensor); + &optim_shape_tensor); } else { shape_range_info_path = Get("model_opt_cache_dir") + "shape_range_info.pbtxt"; @@ -457,10 +436,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, - &opt_input_shape, + &optim_input_shape, &min_shape_tensor, &max_shape_tensor, - &opt_shape_tensor); + &optim_shape_tensor); } else { int fd = open(shape_range_info_path.c_str(), O_WRONLY | O_CREAT, 0644); close(fd); @@ -509,32 +488,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::INT64) { - std::string tmp_name = name + "_cast_to_INT64"; LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having int64 dtype in pdmodel description, but in fact " "it is int32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into int64."; - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", tmp_name)); - scope->Var(tmp_name); } else if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::FP64) { - std::string tmp_name = name + "_cast_to_FP64"; 
LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having float64 dtype in pdmodel description, but in fact " "it is float32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into float64."; - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", tmp_name)); - scope->Var(tmp_name); } } PADDLE_ENFORCE_EQ(output_mapping.empty(), @@ -546,30 +513,73 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( true, platform::errors::PreconditionNotMet("the block has no var-desc")); - // Set attrs + // Get pass attrs. + auto use_varseqlen = Get("use_varseqlen"); + auto with_interleaved = Get("with_interleaved"); + auto tensorrt_transformer_posid = + Get("tensorrt_transformer_posid"); + auto tensorrt_transformer_maskid = + Get("tensorrt_transformer_maskid"); + auto use_dla = Get("trt_use_dla"); + auto dla_core = Get("trt_dla_core"); + auto use_inspector = Get("use_inspector"); + auto disable_trt_plugin_fp16 = Get("disable_trt_plugin_fp16"); + auto context_memory_sharing = Get("context_memory_sharing"); + auto enable_low_precision_io = Get("enable_low_precision_io"); + auto workspace_size = Get("workspace_size"); + auto gpu_device_id = Get("gpu_device_id"); + + // Set op's attrs. op_desc->SetType("tensorrt_engine"); op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - op_desc->SetOutput( "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype); op_desc->SetAttr("max_batch_size", max_batch_size); - op_desc->SetAttr("workspace_size", Get("workspace_size")); - op_desc->SetAttr("gpu_id", Get("gpu_device_id")); + op_desc->SetAttr("workspace_size", workspace_size); + op_desc->SetAttr("gpu_device_id", gpu_device_id); op_desc->SetAttr("output_name_mapping", output_mapping); op_desc->SetAttr("origin_output_rank", renamed_output_rank); - op_desc->SetAttr("parameters", params); + op_desc->SetAttr("parameters", parameters); op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); - op_desc->SetAttr("use_inspector", Get("use_inspector")); - op_desc->SetAttr("model_precision", Get("model_precision")); + op_desc->SetAttr("use_inspector", use_inspector); op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape); - op_desc->SetAttr("enable_low_precision_io", - Get("enable_low_precision_io")); + op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io); + + if (!trt_tuned_dynamic_shape) { + std::vector dynamic_shape_names; + std::vector dynamic_shape_lens; + std::vector min_input_shape_vector; + std::vector max_input_shape_vector; + std::vector opt_input_shape_vector; + for (const auto &it : min_input_shape) { + dynamic_shape_names.push_back(it.first); + dynamic_shape_lens.push_back(it.second.size()); + for (const auto &value : it.second) { + min_input_shape_vector.push_back(value); + } + } + for (const auto &it : max_input_shape) { + for (const auto &value : it.second) { + max_input_shape_vector.push_back(value); + } + } + for (const auto &it : optim_input_shape) { + for (const auto &value : it.second) { + opt_input_shape_vector.push_back(value); + } + } + + op_desc->SetAttr("dynamic_shape_names", dynamic_shape_names); + op_desc->SetAttr("dynamic_shape_lens", dynamic_shape_lens); + 
op_desc->SetAttr("min_input_shape_vector", min_input_shape_vector); + op_desc->SetAttr("max_input_shape_vector", max_input_shape_vector); + op_desc->SetAttr("opt_input_shape_vector", opt_input_shape_vector); + } // we record all inputs' shapes in attr to check if they are consistent // with the real inputs' shapes retrieved from scope when trt runs. @@ -624,14 +634,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("engine_key", engine_key); op_desc->SetAttr("calibration_engine_key", calibration_engine_key); op_desc->SetAttr("predictor_id", predictor_id); - - std::string trt_engine_serialized_data = ""; + op_desc->SetAttr("use_varseqlen", use_varseqlen); + op_desc->SetAttr("with_interleaved", with_interleaved); + op_desc->SetAttr("use_dla", use_dla); + op_desc->SetAttr("dla_core", dla_core); + op_desc->SetAttr("disable_trt_plugin_fp16", disable_trt_plugin_fp16); + op_desc->SetAttr("context_memory_sharing", context_memory_sharing); + std::string trt_engine_serialized_data; op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data); op_desc->Flush(); std::unique_ptr calibrator; if (enable_int8 && !calibration_data.empty()) { - calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + calibrator = + std::make_unique(calibration_data); LOG(INFO) << "RUN Paddle TRT int8 calibration mode..."; } // When in int8 mode and calibration_mode, the program just produce the @@ -656,7 +672,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( "static shape mode instead."; min_input_shape = {}; max_input_shape = {}; - opt_input_shape = {}; + optim_input_shape = {}; } const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION); @@ -677,42 +693,33 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( framework::ir::Agent(node).subgraph()->end()); framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); - // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not - // run fp16. - // When running fp16, the output accuracy of the model will be affected, - // closing the plugin fp16 may bring some improvement on accuracy. 
- bool disable_trt_plugin_fp16 = Get("disable_trt_plugin_fp16"); + tensorrt::TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size; + params.max_workspace_size = workspace_size; + params.calibrator = calibrator.get(); + params.device_id = gpu_device_id; + params.with_dynamic_shape = with_dynamic_shape; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; + params.min_shape_tensor = min_shape_tensor; + params.max_shape_tensor = max_shape_tensor; + params.optim_shape_tensor = optim_shape_tensor; + params.disable_trt_plugin_fp16 = disable_trt_plugin_fp16; + params.precision = precision_mode; + params.use_varseqlen = use_varseqlen; + params.use_dla = use_dla; + params.dla_core = dla_core; + params.with_interleaved = with_interleaved; + params.tensorrt_transformer_posid = tensorrt_transformer_posid; + params.tensorrt_transformer_maskid = tensorrt_transformer_maskid; + params.context_memory_sharing = context_memory_sharing; + params.use_inspector = use_inspector; + params.enable_low_precision_io = enable_low_precision_io; + tensorrt::TensorRTEngine *trt_engine = inference::Singleton::Global() - .Create(engine_key + std::to_string(predictor_id), - max_batch_size, - Get("workspace_size"), - static_cast(precision_mode), - calibrator.get(), - Get("gpu_device_id"), - with_dynamic_shape, - min_input_shape, - max_input_shape, - opt_input_shape, - min_shape_tensor, - max_shape_tensor, - opt_shape_tensor, - disable_trt_plugin_fp16, - static_cast(Get("model_precision"))); - trt_engine->SetUseOSS(Get("use_varseqlen")); - trt_engine->SetWithInterleaved(Get("with_interleaved")); - trt_engine->SetTransformerPosid( - Get("tensorrt_transformer_posid")); - trt_engine->SetTransformerMaskid( - Get("tensorrt_transformer_maskid")); - trt_engine->SetUseDLA(Get("trt_use_dla")); - trt_engine->SetDLACore(Get("trt_dla_core")); - trt_engine->SetUseInspector(Get("use_inspector")); - trt_engine->SetWithErnie( - graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)); - trt_engine->SetContextMemorySharing(Get("context_memory_sharing")); - trt_engine->SetLowPrecisionIO(Get("enable_low_precision_io")); + .Create(engine_key + std::to_string(predictor_id), params); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( @@ -749,13 +756,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( "kernel etc). 
This process may cost a lot of time."; framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); - std::unordered_set param_set(params.begin(), params.end()); + std::unordered_set parameters_set(parameters.begin(), + parameters.end()); inference::Singleton::Global() .ConvertBlockToTRTEngine( &block_desc_temp, *scope, std::vector(input_names.begin(), input_names.end()), - param_set, + parameters_set, output_mapping, trt_engine); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 60d94eae7a6e03395d43bb648076bc5e3c2f6a37..8106dfbb9e6aae1e3319cb9f7ee3a01d64d52c1f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include #include #include "paddle/fluid/framework/executor.h" @@ -63,8 +64,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { "set.")); } - auto graph = std::unique_ptr( - new framework::ir::Graph(argument->main_program())); + auto graph = std::make_unique(argument->main_program()); argument->SetMainGraph(graph.release()); auto *scope_ptr = argument->scope_ptr(); PADDLE_ENFORCE_NOT_NULL(scope_ptr, diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index 2cfba89a91c359e725e8d52c0e3263835de7c465..80326b243ad461d84cfd16abc8b198ebbf60560c 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -24,16 +24,6 @@ namespace inference { namespace analysis { void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { - if (!argument->save_optimized_model()) { - LOG(WARNING) << "save_optim_cache_model is turned off, skip " - "save_optimized_model_pass"; - return; - } - if (!argument->enable_ir_optim()) { - LOG(WARNING) << "ir_optim is turned off, skip save_optimized_model_pass"; - return; - } - std::string model_opt_cache_dir = argument->optim_cache_dir(); if (!model_opt_cache_dir.empty()) { if (!PathExists(model_opt_cache_dir)) { @@ -55,9 +45,11 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { auto* graph = argument->main_graph_ptr(); framework::ProgramDesc optimized_program_desc; + // NOTE(liuyuanle): If the following line of code is not added, an error // [SegmentFault] may occur! optimized_program_desc.CopyFrom(*argument->main_program().Proto()); + framework::ir::GraphToProgram(*graph, &optimized_program_desc); auto IsPersistable = [](const framework::VarDesc* var) { @@ -133,11 +125,10 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } void SaveOptimizedModelPass::RunImpl(Argument* argument) { - // TODO(inference): Support trt. 
- if (argument->use_xpu() || - (argument->use_gpu() && !argument->use_tensorrt())) { - SaveOptimizedModel(argument); + if (!argument->save_optimized_model() || !argument->enable_ir_optim()) { + return; } + SaveOptimizedModel(argument); } std::string SaveOptimizedModelPass::repr() const { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 037b4a32211a718b888f34f90b8aaa84fcc0b0db..78fb47c4b80532bae9d0469adf14d1bb8c794ea5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -641,7 +641,7 @@ bool AnalysisPredictor::PrepareProgram( } bool AnalysisPredictor::CreateExecutor() { - executor_.reset(new paddle::framework::NaiveExecutor(place_)); + executor_ = std::make_unique(place_); return true; } @@ -1341,7 +1341,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::PrepareArgument() { VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. - argument_.reset(new Argument); + argument_ = std::make_unique(); argument_->SetUseGPU(config_.use_gpu()); argument_->SetUseCutlass(config_.use_cutlass_); argument_->SetUseFcPadding(config_.use_fc_padding()); @@ -1570,7 +1570,8 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.ir_optim()) { argument_->SetEnableIrOptim(false); - if (config_.enable_gpu_mixed_) { + if (config_.enable_gpu_mixed_ && + model_precision_ == phi::DataType::FLOAT32) { argument_->SetEnableIrOptim(true); pass_builder->ClearPasses(); pass_builder->AppendPass("auto_mixed_precision_pass"); @@ -1886,6 +1887,10 @@ AnalysisPredictor::GetInputTypes() { input_type[name] = paddle_infer::DataType::UINT8; } else if (dtype == paddle::framework::proto::VarType::INT8) { input_type[name] = paddle_infer::DataType::INT8; + } else if (dtype == paddle::framework::proto::VarType::FP64) { + input_type[name] = paddle_infer::DataType::FLOAT64; + } else if (dtype == paddle::framework::proto::VarType::BOOL) { + input_type[name] = paddle_infer::DataType::BOOL; } else { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Unsupported data type `%s` when get input dtype ", dtype)); @@ -2609,7 +2614,7 @@ AnalysisPredictor::~AnalysisPredictor() { #ifdef PADDLE_WITH_TENSORRT if (config_.trt_engine_memory_sharing()) { inference::Singleton::Global() - .releaseContextMemory(predictor_id_); + .ReleaseContextMemory(predictor_id_); } #endif } diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index b07b20bb82121260a25ecd49b2287eca8cc37b6f..429bc89f0d90ea42366696dc26f9aad48f9158ca 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -167,7 +167,7 @@ class OpConverter { op_desc.Type())); it->SetEngine(engine); - engine->SetScope(scope); + engine->SetScope(&scope); it->SetBlockDesc(block); (*it)(op, scope, test_mode); @@ -301,7 +301,7 @@ class OpConverter { nvinfer1::DataType in_dtype = FluidDataType2TRT(var->GetDataType()); if (engine->precision() == phi::DataType::FLOAT16 && in_dtype == nvinfer1::DataType::kFLOAT && - engine->EnableLowPrecisionIO()) { + engine->LowPrecisionIOEnabled()) { in_dtype = nvinfer1::DataType::kHALF; } @@ -360,7 +360,7 @@ class OpConverter { nvinfer1::DataType out_dtype = FluidDataType2TRT(var->GetDataType()); if (engine->precision() == phi::DataType::FLOAT16 && out_dtype == nvinfer1::DataType::kFLOAT && - engine->EnableLowPrecisionIO()) { + 
engine->LowPrecisionIOEnabled()) { out_dtype = nvinfer1::DataType::kHALF; } engine->DeclareOutput(output, out_dtype); @@ -470,7 +470,7 @@ class OpConverter { auto shape = newShape->getDimensions(); shuffle->setReshapeDimensions(shape); } - if (name != "") { + if (!name.empty()) { shuffle->setName(name.c_str()); } return shuffle->getOutput(0); @@ -481,7 +481,7 @@ class OpConverter { const std::string& name = "") { auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); shuffle->setReshapeDimensions(shape); - if (name != "") { + if (!name.empty()) { shuffle->setName(name.c_str()); } return shuffle->getOutput(0); @@ -774,11 +774,6 @@ class OpConverter { bool test_mode_; private: - // registered op converter map, whose key is the fluid op type, and value is - // the pointer position of corresponding OpConverter class. - std::unordered_map converters_; - // fluid inference scope - framework::Scope* scope_{nullptr}; std::mutex mut_; }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc index 278c647ca582c02e0b5c75e6d9fbb74acf7a97bb..4ab1f7ca3f2baeb65c0e5129d6db549bfbbde73c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include // NOLINT +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -95,7 +96,11 @@ TEST(CustomPluginCreater, StaticShapePlugin) { // init trt engine std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15)); + + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); engine_->DeclareInput( @@ -173,15 +178,10 @@ TEST(CustomPluginCreater, DynamicShapePlugin) { std::map> optim_input_shape = { {"x", {1, 2, 5, 5}}}; - engine_.reset(new TensorRTEngine(5, - 1 << 15, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); LOG(INFO) << "with_dynamic_shape " << engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 3a15af255e5bce4118a23120b1c5c9293817ed65..b653df0bca83bccb8c6460e506b17985c3240b88 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include // NOLINT +#include + #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -28,7 +30,10 @@ TEST(OpConverter, ConvertBlock) { // init trt engine std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); engine_->DeclareInput( diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index ee6b1393c8aae59a3c291f6816ae0b7dd939a92a..8a41d564a09da3e4c73ebf2a6aff2b2c44f12690 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -88,7 +88,10 @@ class TRTConvertValidation { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0, platform::errors::External("cudaStreamCreate error.")); - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size; + params.max_workspace_size = workspace_size; + engine_ = std::make_unique(params); engine_->InitNetwork(); } @@ -155,7 +158,7 @@ class TRTConvertValidation { engine_->FreezeNetwork(); // Declare outputs. - op_desc_.reset(new framework::OpDesc(desc, nullptr)); + op_desc_ = std::make_unique(desc, nullptr); } // We use the set 'neglected_output' here, because some Ops like batch norm, diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 6e9beb810f0bd067bd324d56ba35c9d6a9c4f27e..7803989aa38c5f00f8eb1071e6b0b11efe9a1bb8 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -21,17 +21,15 @@ limitations under the License. */ #include "NvInferRuntimeCommon.h" #include "cuda_runtime_api.h" // NOLINT + #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/enforce.h" namespace paddle { namespace inference { namespace tensorrt { -int TensorRTEngine::runtime_batch_ = 1; thread_local int TensorRTEngine::predictor_id_per_thread = -1; void TensorRTEngine::Weight::SetDataType(phi::DataType type) { @@ -64,10 +62,10 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) { } void TensorRTEngine::InitNetwork() { - freshDeviceId(); + FreshDeviceId(); infer_builder_.reset(createInferBuilder(&logger_)); - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { infer_network_.reset(infer_builder_->createNetworkV2( 1U << static_cast( nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); @@ -92,7 +90,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { // IExecutionContext... // It's ok. We will set it later. 
nvinfer1::IExecutionContext *infer_context{nullptr}; - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { infer_context = infer_engine_->createExecutionContextWithoutDeviceMemory(); } else { @@ -102,7 +100,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { infer_context, platform::errors::InvalidArgument( "TensorRT engine can not build execution context.")); - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { // need new profile if it's not the first if (cur_profile_num_ > 0) { infer_context->setOptimizationProfile(cur_profile_num_); @@ -118,15 +116,15 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { void TensorRTEngine::Execute(int batch_size, std::vector *buffers, cudaStream_t stream) { - freshDeviceId(); + FreshDeviceId(); auto infer_context = context(); - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { void *context_memory{nullptr}; context_memory = inference::Singleton::Global() - .getContextMemory( + .GetContextMemory( predictor_id_per_thread, - phi::GPUPlace(device_id_), + phi::GPUPlace(device_id()), phi::Stream(reinterpret_cast(stream))); infer_context->setDeviceMemory(context_memory); } @@ -182,12 +180,11 @@ bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context, } else { ret = context->enqueueV2(buffers->data(), stream, nullptr); } - SetRuntimeBatch(batch_size); return ret; } void TensorRTEngine::FreezeNetwork() { - freshDeviceId(); + FreshDeviceId(); VLOG(3) << "TRT to freeze network"; PADDLE_ENFORCE_NOT_NULL(infer_builder_, platform::errors::InvalidArgument( @@ -197,17 +194,17 @@ void TensorRTEngine::FreezeNetwork() { platform::errors::InvalidArgument( "Call InitNetwork first to initialize network.")); // build engine. - if (!with_dynamic_shape_) { - infer_builder_->setMaxBatchSize(max_batch_); + if (!with_dynamic_shape()) { + infer_builder_->setMaxBatchSize(params_.max_batch_size); } #if IS_TRT_VERSION_GE(8300) infer_builder_config_->setMemoryPoolLimit( - nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_); + nvinfer1::MemoryPoolType::kWORKSPACE, params_.max_workspace_size); #else - infer_builder_config_->setMaxWorkspaceSize(max_workspace_); + infer_builder_config_->setMaxWorkspaceSize(params_.max_workspace_size); #endif - bool enable_fp16 = (precision_ == phi::DataType::FLOAT16); + bool enable_fp16 = (precision() == phi::DataType::FLOAT16); if (enable_fp16) { bool support_fp16 = infer_builder_->platformHasFastFp16(); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); @@ -219,15 +216,15 @@ void TensorRTEngine::FreezeNetwork() { } } - bool enable_int8 = (precision_ == phi::DataType::INT8); + bool enable_int8 = (precision() == phi::DataType::INT8); if (enable_int8) { - if (!use_dla_) { + if (!use_dla()) { infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); } infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - if (calibrator_) { - infer_builder_config_->setInt8Calibrator(calibrator_); + if (params_.calibrator) { + infer_builder_config_->setInt8Calibrator(params_.calibrator); } else { infer_builder_config_->setInt8Calibrator(nullptr); @@ -259,7 +256,7 @@ void TensorRTEngine::FreezeNetwork() { } } - if (use_dla_) { + if (use_dla()) { if (!enable_int8 && !enable_fp16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " "set float32, so DLA is not used."; @@ -268,42 +265,43 @@ void TensorRTEngine::FreezeNetwork() { << "TensorRT DLA is set by config, but your device does not have " "DLA, so DLA is not used."; } else { - if (dla_core_ < 0 || 
dla_core_ >= infer_builder_->getNbDLACores()) { - dla_core_ = 0; + if (params_.dla_core < 0 || + params_.dla_core >= infer_builder_->getNbDLACores()) { + params_.dla_core = 0; LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " << infer_builder_->getNbDLACores() << ", but got " - << dla_core_ << ", so use use 0 as default."; + << params_.dla_core << ", so use use 0 as default."; } infer_builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - infer_builder_config_->setDLACore(dla_core_); + infer_builder_config_->setDLACore(params_.dla_core); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore " - << dla_core_; + << params_.dla_core; } } - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (int i = 0; i < max_profile_num_; i++) { - for (auto &input : min_input_shape_) { + for (auto &input : min_input_shape()) { #if IS_TRT_VERSION_LT(7100) // trt6/trt7011 will check all_of input > 0 if (!(std::all_of(input.second.begin(), input.second.end(), [](int x) { return x > 0; }) && - std::all_of(max_input_shape_[input.first].begin(), - max_input_shape_[input.first].end(), + std::all_of(max_input_shape()[input.first].begin(), + max_input_shape()[input.first].end(), [](int x) { return x > 0; }) && - std::all_of(optim_input_shape_[input.first].begin(), - optim_input_shape_[input.first].end(), + std::all_of(optim_input_shape()[input.first].begin(), + optim_input_shape()[input.first].end(), [](int x) { return x > 0; }))) { continue; } #endif VLOG(4) << "TRT dynamic_shape set " << input.first << " min: " << Vec2Str(input.second) - << ", max: " << Vec2Str(max_input_shape_[input.first]) - << ", opt: " << Vec2Str(optim_input_shape_[input.first]); + << ", max: " << Vec2Str(max_input_shape()[input.first]) + << ", opt: " << Vec2Str(optim_input_shape()[input.first]); optim_profiles_[i]->setDimensions( input.first.c_str(), @@ -312,38 +310,39 @@ void TensorRTEngine::FreezeNetwork() { optim_profiles_[i]->setDimensions( input.first.c_str(), nvinfer1::OptProfileSelector::kMAX, - Vec2TRT_Dims(max_input_shape_[input.first], input.first, true)); + Vec2TRT_Dims(max_input_shape()[input.first], input.first, true)); optim_profiles_[i]->setDimensions( input.first.c_str(), nvinfer1::OptProfileSelector::kOPT, - Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); + Vec2TRT_Dims(optim_input_shape()[input.first], input.first, true)); } for (int input_id = 0; input_id < network()->getNbInputs(); input_id++) { auto input_name = network()->getInput(input_id)->getName(); if (!itensor_map_.count(input_name)) continue; if (!GetITensor(input_name)->isShapeTensor()) continue; - PADDLE_ENFORCE_EQ(min_shape_tensor_.count(input_name) && - max_shape_tensor_.count(input_name) && - optim_shape_tensor_.count(input_name), + PADDLE_ENFORCE_EQ(min_shape_tensor().count(input_name) > 0 && + max_shape_tensor().count(input_name) > 0 && + optim_shape_tensor().count(input_name) > 0, true, platform::errors::InvalidArgument( "Fail to find min/max/optim shape value for TRT " "network's shape tensor input named %s.", input_name)); - auto min_vec = min_shape_tensor_.at(input_name); + auto min_vec = min_shape_tensor().at(input_name); optim_profiles_[i]->setShapeValues(input_name, nvinfer1::OptProfileSelector::kMIN, min_vec.data(), min_vec.size()); - optim_profiles_[i]->setShapeValues(input_name, - nvinfer1::OptProfileSelector::kMAX, - max_shape_tensor_[input_name].data(), - 
min_vec.size()); + optim_profiles_[i]->setShapeValues( + input_name, + nvinfer1::OptProfileSelector::kMAX, + max_shape_tensor()[input_name].data(), + min_vec.size()); optim_profiles_[i]->setShapeValues( input_name, nvinfer1::OptProfileSelector::kOPT, - optim_shape_tensor_[input_name].data(), + optim_shape_tensor()[input_name].data(), min_vec.size()); } @@ -358,7 +357,7 @@ void TensorRTEngine::FreezeNetwork() { } } #if IS_TRT_VERSION_GE(8200) - if (use_inspector_) { + if (params_.use_inspector) { infer_builder_config_->setProfilingVerbosity( nvinfer1::ProfilingVerbosity::kDETAILED); } @@ -388,12 +387,12 @@ void TensorRTEngine::FreezeNetwork() { cur_profile_num_ = 0; } // for engine context memory sharing - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { inference::Singleton::Global() - .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(), predictor_id_per_thread); } - if (use_inspector_) { + if (params_.use_inspector) { GetEngineInfo(); } } @@ -539,7 +538,7 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( } // In fact , this is not always right, because we can't determine if the 0th // dimension is batch. Just for run chenqu's model - if (!this->with_dynamic_shape()) { + if (!with_dynamic_shape()) { trt_in_shape.nbDims--; for (int i = 0; i < trt_in_shape.nbDims; i++) { trt_in_shape.d[i] = trt_in_shape.d[i + 1]; @@ -563,12 +562,12 @@ std::unordered_map } void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { - freshDeviceId(); + FreshDeviceId(); infer_runtime_.reset(createInferRuntime(&logger_)); - if (use_dla_) { - if (precision_ != phi::DataType::INT8 && - precision_ != phi::DataType::FLOAT16) { + if (use_dla()) { + if (precision() != phi::DataType::INT8 && + precision() != phi::DataType::FLOAT16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " "set float32, so DLA is not used."; } else if (infer_runtime_->getNbDLACores() == 0) { @@ -576,15 +575,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { << "TensorRT DLA is set by config, but your device does not have " "DLA, so DLA is not used."; } else { - if (dla_core_ < 0 || dla_core_ >= infer_runtime_->getNbDLACores()) { - dla_core_ = 0; + if (params_.dla_core < 0 || + params_.dla_core >= infer_runtime_->getNbDLACores()) { + params_.dla_core = 0; LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " << infer_runtime_->getNbDLACores() << ", but got " - << dla_core_ << ", so use use 0 as default."; + << params_.dla_core << ", so use use 0 as default."; } - infer_runtime_->setDLACore(dla_core_); + infer_runtime_->setDLACore(params_.dla_core); LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " - << dla_core_; + << params_.dla_core; } } @@ -602,20 +602,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { binding_num_ = infer_engine_->getNbBindings(); // for engine context memory sharing - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { inference::Singleton::Global() - .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(), predictor_id_per_thread); } - if (use_inspector_) { + if (params_.use_inspector) { GetEngineInfo(); } } -void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { - runtime_batch_ = batch_size; -} - // Note: Only for support plugin. 
TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( const std::string &name, const phi::DenseTensor &weight_tensor) { @@ -830,8 +826,6 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( return weight; } -int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } - nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, @@ -856,16 +850,16 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( return network()->addPluginV2(inputs, num_inputs, *plugin); } -void TensorRTEngine::freshDeviceId() { +void TensorRTEngine::FreshDeviceId() { int count; cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_id_, + PADDLE_ENFORCE_LT(device_id(), count, platform::errors::OutOfRange( "Device id %d exceeds the current device count: %d.", - device_id_, + device_id(), count)); - platform::SetDeviceId(device_id_); + platform::SetDeviceId(device_id()); } void TensorRTEngine::GetEngineInfo() { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index bd678be2c1c0ededfb286272792436173178b40e..3ad2ca30a4cff99998c0f220f949b2e08d37ab99 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#include - #include #include #include @@ -25,21 +23,21 @@ limitations under the License. */ #include #include #include + +#include #include "NvInferRuntimeCommon.h" -#include "paddle/fluid/framework/lod_tensor.h" + #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/stream.h" -#include "paddle/utils/any.h" PHI_DECLARE_bool(trt_ibuilder_cache); @@ -47,6 +45,12 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { +class PluginTensorRT; +} // namespace plugin + +class TRTInt8Calibrator; + // The code is mainly from TensorRT, thanks to the project. class TrtCudaGraph { public: @@ -105,140 +109,6 @@ class TrtCudaGraph { cudaGraphExec_t cuda_graph_exec_{}; }; -namespace plugin { -class PluginTensorRT; -} // namespace plugin - -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - case FluidDT::VarType_Type_FP64: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - case FluidDT::VarType_Type_INT64: - return TRT_DT::kINT32; - case FluidDT::VarType_Type_FP16: - return TRT_DT::kHALF; -#if IS_TRT_VERSION_GE(8400) - case FluidDT::VarType_Type_BOOL: - return TRT_DT::kBOOL; - -#endif - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported datatype in TRT op converter, type: %s. " - "Boolean type is supported as TRT input/output " - "using TensorRT v8.4+.", - VarType_Type_Name(type))); - } - return TRT_DT::kINT32; -} - -// The T can be int32 or int64 type. 
-template -nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, - std::string input, - bool with_dynamic_shape = false) { - PADDLE_ENFORCE_GE(shape.size(), - 0UL, - platform::errors::InvalidArgument( - "TensorRT's tensor input requires at least 0 " - "dimensions, but input %s has %d dims.", - input, - shape.size())); - - auto ShapeStr = [](const std::vector& shape) { - std::ostringstream os; - os << "["; - for (size_t i = 0; i < shape.size(); ++i) { - if (i == shape.size() - 1) { - os << shape[i]; - } else { - os << shape[i] << ","; - } - } - os << "]"; - return os.str(); - }; - if (!with_dynamic_shape) { - if (shape.size() == 4UL) { - if (shape[2] == -1 || shape[3] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims3(shape[1], shape[2], shape[3]); - } else if (shape.size() == 5UL) { - if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]); - } else if (shape.size() == 3UL) { - if (shape[1] == -1 || shape[2] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims2(shape[1], shape[2]); - } else if (shape.size() == 2UL) { - if (shape[1] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - nvinfer1::Dims dims; - dims.nbDims = 1; - dims.d[0] = shape[1]; - return dims; - } - // static shape doesn't support 1D op so far. - PADDLE_ENFORCE_NE(shape.size(), - 1UL, - platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s." - "it's not supported by trt so far", - input, - ShapeStr(shape))); - - nvinfer1::Dims dims; - dims.nbDims = shape.size() - 1; - for (size_t i = 1; i < shape.size(); i++) { - dims.d[i - 1] = shape[i]; - } - return dims; - } else { - if (shape.size() == 4UL) { - return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); - } else if (shape.size() == 3UL) { - return nvinfer1::Dims3(shape[0], shape[1], shape[2]); - } - nvinfer1::Dims dims; - dims.nbDims = shape.size(); - for (size_t i = 0; i < shape.size(); i++) { - dims.d[i] = shape[i]; - } - return dims; - } -} -} // namespace - -class TRTInt8Calibrator; - /* * TensorRT Engine. * @@ -251,6 +121,54 @@ class TensorRTEngine { using PredictorID = int; public: + /* + * Construction parameters of TensorRTEngine. + */ + struct ConstructionParams { + // The max batch size. + int32_t max_batch_size; + + // The max memory size the engine uses. + int64_t max_workspace_size; + + // The precision of engine. + phi::DataType precision{phi::DataType::FLOAT32}; + + TRTInt8Calibrator* calibrator{nullptr}; + + // Use for engine context memory sharing. 
+ bool context_memory_sharing{false}; + + int device_id{0}; + + bool with_dynamic_shape{false}; + + bool use_dla{false}; + int dla_core{0}; + + ShapeMapType min_input_shape; + ShapeMapType max_input_shape; + ShapeMapType optim_input_shape; + ShapeMapType min_shape_tensor; + ShapeMapType max_shape_tensor; + ShapeMapType optim_shape_tensor; + + bool use_inspector{false}; + + // + // From tensorrt_subgraph_pass, only used for OpConverter. + // + bool use_varseqlen{false}; + bool with_interleaved{false}; + std::string tensorrt_transformer_posid; + std::string tensorrt_transformer_maskid; + bool enable_low_precision_io{false}; + // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will + // not run fp16. When running fp16, the output accuracy of the model will be + // affected, closing the plugin fp16 may bring some improvement on accuracy. + bool disable_trt_plugin_fp16{false}; + }; + // Weight is model parameter. class Weight { public: @@ -276,47 +194,14 @@ class TensorRTEngine { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, - int64_t max_workspace, - phi::DataType precision = phi::DataType::FLOAT32, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - bool with_dynamic_shape = false, - const ShapeMapType& min_input_shape = {}, - const ShapeMapType& max_input_shape = {}, - const ShapeMapType& optim_input_shape = {}, - const ShapeMapType& min_shape_tensor = {}, - const ShapeMapType& max_shape_tensor = {}, - const ShapeMapType& optim_shape_tensor = {}, - bool disable_trt_plugin_fp16 = false, - phi::DataType model_precision = phi::DataType::FLOAT32, + TensorRTEngine(const ConstructionParams& params, nvinfer1::ILogger& logger = NaiveLogger::Global()) - : max_batch_(max_batch), - max_workspace_(max_workspace), - precision_(precision), - calibrator_(calibrator), - device_id_(device_id), - with_dynamic_shape_(with_dynamic_shape), - min_input_shape_(min_input_shape), - max_input_shape_(max_input_shape), - optim_input_shape_(optim_input_shape), - min_shape_tensor_(min_shape_tensor), - max_shape_tensor_(max_shape_tensor), - optim_shape_tensor_(optim_shape_tensor), - disable_trt_plugin_fp16_(disable_trt_plugin_fp16), - model_precision_(model_precision), - logger_(logger) { - dy::initLibNvInferPlugins(&logger, ""); - } - - ~TensorRTEngine() { - for (auto& attr : attrs_) { - if (attr_dels_.find(attr.first) != attr_dels_.end()) { - attr_dels_[attr.first](); - } - } - attrs_.clear(); - attr_dels_.clear(); + : params_(params), logger_(logger) { + dy::initLibNvInferPlugins(&logger_, ""); + static std::once_flag trt_plugin_registered; + std::call_once(trt_plugin_registered, []() { + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + }); } // Add an input and set its name, data type and dimension. 
@@ -346,15 +231,6 @@ class TensorRTEngine { nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::IExecutionContext* context(); - int GetProfileIndex() { - if (max_profile_num_ > 1) { - std::unique_lock lock(mutex_); - return profile_index_[predictor_id_per_thread]; - } else { - return 0; - } - } - int GetBindingsOffset() { return (binding_num_ / max_profile_num_) * GetProfileIndex(); } @@ -390,25 +266,20 @@ class TensorRTEngine { void Deserialize(const std::string& engine_serialized_data); - void SetRuntimeBatch(size_t batch_size); - int GetRuntimeBatch(); - bool WithFp16() { - bool enable_fp16 = (precision_ == phi::DataType::FLOAT16); + bool enable_fp16 = (precision() == phi::DataType::FLOAT16); bool support_fp16 = infer_builder_->platformHasFastFp16(); // below is consistent with setFlag in engine.cc - bool fall_back_fp16 = WithInt8() && !use_dla_; + bool fall_back_fp16 = WithInt8() && !use_dla(); return (enable_fp16 || fall_back_fp16) && support_fp16; } bool WithInt8() { - bool enable_int8 = (precision_ == phi::DataType::INT8); + bool enable_int8 = (precision() == phi::DataType::INT8); bool support_int8 = infer_builder_->platformHasFastInt8(); return enable_int8 && support_int8; } - int GetDeviceId() { return device_id_; } - nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); @@ -471,19 +342,6 @@ class TensorRTEngine { suffix_counter += 1; } - void SetUseOSS(bool use_varseqlen) { use_varseqlen_ = use_varseqlen; } - void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } - void SetDLACore(int dla_core) { dla_core_ = dla_core; } - void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } - void SetWithInterleaved(bool with_interleaved) { - with_interleaved_ = with_interleaved; - } - void SetTransformerPosid(std::string tensorrt_transformer_posid) { - tensorrt_transformer_posid_ = tensorrt_transformer_posid; - } - void SetTransformerMaskid(std::string tensorrt_transformer_maskid) { - tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; - } void ClearWeights() { for (auto& weight_pair : weight_map) { weight_pair.second.reset(nullptr); @@ -508,12 +366,12 @@ class TensorRTEngine { nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } - ShapeMapType& min_input_shape() { return min_input_shape_; } - ShapeMapType& max_input_shape() { return max_input_shape_; } - ShapeMapType& optim_input_shape() { return optim_input_shape_; } - ShapeMapType& min_shape_tensor() { return min_shape_tensor_; } - ShapeMapType& max_shape_tensor() { return max_shape_tensor_; } - ShapeMapType& optim_shape_tensor() { return optim_shape_tensor_; } + ShapeMapType& min_input_shape() { return params_.min_input_shape; } + ShapeMapType& max_input_shape() { return params_.max_input_shape; } + ShapeMapType& optim_input_shape() { return params_.optim_input_shape; } + ShapeMapType& min_shape_tensor() { return params_.min_shape_tensor; } + ShapeMapType& max_shape_tensor() { return params_.max_shape_tensor; } + ShapeMapType& optim_shape_tensor() { return params_.optim_shape_tensor; } bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape, const ShapeMapType& runtime_shape_tensor, @@ -526,22 +384,22 @@ class TensorRTEngine { auto name = it.first; auto input_shape = it.second; // Make 0-D tensor to 1-D tensor. 
- if (input_shape.size() == 0) { + if (input_shape.empty()) { input_shape.push_back(1); } bool min_change = false; bool max_change = false; std::vector bak_min_shape; std::vector bak_max_shape; - if (!min_input_shape_.count(name)) { - min_input_shape_[name] = input_shape; - max_input_shape_[name] = input_shape; - optim_input_shape_[name] = input_shape; + if (!params_.min_input_shape.count(name)) { + params_.min_input_shape[name] = input_shape; + params_.max_input_shape[name] = input_shape; + params_.optim_input_shape[name] = input_shape; min_change = true; max_change = true; ret = true; } else { - PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), + PADDLE_ENFORCE_EQ(params_.min_input_shape[name].size(), input_shape.size(), platform::errors::InvalidArgument( "TRT dynamic_shape min_input_shape %s size not " @@ -549,33 +407,33 @@ class TensorRTEngine { ", but the runtime_input_shape[%s].size()=%d.", name, name, - min_input_shape_[name].size(), + params_.min_input_shape[name].size(), name, input_shape.size())); - bak_min_shape = min_input_shape_[name]; - bak_max_shape = max_input_shape_[name]; + bak_min_shape = params_.min_input_shape[name]; + bak_max_shape = params_.max_input_shape[name]; for (size_t d = 0; d < input_shape.size(); ++d) { - if (input_shape[d] < min_input_shape_[name][d]) { + if (input_shape[d] < params_.min_input_shape[name][d]) { ret = true; min_change = true; - min_input_shape_[name][d] = input_shape[d]; + params_.min_input_shape[name][d] = input_shape[d]; } - if (input_shape[d] > max_input_shape_[name][d]) { + if (input_shape[d] > params_.max_input_shape[name][d]) { ret = true; max_change = true; - max_input_shape_[name][d] = input_shape[d]; + params_.max_input_shape[name][d] = input_shape[d]; } } } if (min_change) LOG(INFO) << "refactor tensor shape range: " << name << ", min_shape from " << Vec2Str(bak_min_shape) << " to " - << Vec2Str(min_input_shape_[name]); + << Vec2Str(params_.min_input_shape[name]); if (max_change) LOG(INFO) << "refactor tensor shape range: " << name << ", max_shape from " << Vec2Str(bak_max_shape) << " to " - << Vec2Str(max_input_shape_[name]); + << Vec2Str(params_.max_input_shape[name]); if (min_change || max_change) changed->push_back(name); } for (const auto& it : runtime_shape_tensor) { @@ -585,15 +443,15 @@ class TensorRTEngine { bool max_change = false; std::vector bak_min_shape; std::vector bak_max_shape; - if (!min_shape_tensor_.count(name)) { - min_shape_tensor_[name] = shape_tensor; - max_shape_tensor_[name] = shape_tensor; - optim_shape_tensor_[name] = shape_tensor; + if (!params_.min_shape_tensor.count(name)) { + params_.min_shape_tensor[name] = shape_tensor; + params_.max_shape_tensor[name] = shape_tensor; + params_.optim_shape_tensor[name] = shape_tensor; min_change = true; max_change = true; ret = true; } else { - PADDLE_ENFORCE_EQ(min_shape_tensor_[name].size(), + PADDLE_ENFORCE_EQ(params_.min_shape_tensor[name].size(), shape_tensor.size(), platform::errors::InvalidArgument( "TRT dynamic_shape min_shape_tensor %s size not " @@ -601,50 +459,50 @@ class TensorRTEngine { ", but the runtime_shape_tensor[%s].size()=%d.", name, name, - min_shape_tensor_[name].size(), + params_.min_shape_tensor[name].size(), name, shape_tensor.size())); - bak_min_shape = min_shape_tensor_[name]; - bak_max_shape = max_shape_tensor_[name]; + bak_min_shape = params_.min_shape_tensor[name]; + bak_max_shape = params_.max_shape_tensor[name]; for (size_t d = 0; d < shape_tensor.size(); ++d) { - if (shape_tensor[d] < min_shape_tensor_[name][d]) { + if 
(shape_tensor[d] < params_.min_shape_tensor[name][d]) { ret = true; min_change = true; - min_shape_tensor_[name][d] = shape_tensor[d]; + params_.min_shape_tensor[name][d] = shape_tensor[d]; } - if (shape_tensor[d] > max_shape_tensor_[name][d]) { + if (shape_tensor[d] > params_.max_shape_tensor[name][d]) { ret = true; max_change = true; - max_shape_tensor_[name][d] = shape_tensor[d]; + params_.max_shape_tensor[name][d] = shape_tensor[d]; } } } if (min_change) LOG(INFO) << "refactor shape tensor range: " << name << ", min_shape from " << Vec2Str(bak_min_shape) << " to " - << Vec2Str(min_shape_tensor_[name]); + << Vec2Str(params_.min_shape_tensor[name]); if (max_change) LOG(INFO) << "refactor shape tensor range: " << name << ", max_shape from " << Vec2Str(bak_max_shape) << " to " - << Vec2Str(max_shape_tensor_[name]); + << Vec2Str(params_.max_shape_tensor[name]); if (min_change || max_change) tensor_changed->push_back(name); } return ret; } - bool use_varseqlen() { return use_varseqlen_; } - bool with_ernie() { return with_ernie_; } - bool with_interleaved() { return with_interleaved_; } - std::string tensorrt_transformer_posid() { - return tensorrt_transformer_posid_; + bool use_varseqlen() { return params_.use_varseqlen; } + bool use_dla() { return params_.use_dla; } + bool with_interleaved() { return params_.with_interleaved; } + const std::string& tensorrt_transformer_posid() { + return params_.tensorrt_transformer_posid; } - std::string tensorrt_transformer_maskid() { - return tensorrt_transformer_maskid_; + const std::string& tensorrt_transformer_maskid() { + return params_.tensorrt_transformer_maskid; } - bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } - bool with_dynamic_shape() { return with_dynamic_shape_; } - phi::DataType precision() { return precision_; } + bool disable_trt_plugin_fp16() { return params_.disable_trt_plugin_fp16; } + bool with_dynamic_shape() { return params_.with_dynamic_shape; } + phi::DataType precision() { return params_.precision; } #if IS_TRT_VERSION_GE(6000) nvinfer1::IPluginV2Layer* AddDynamicPlugin( @@ -656,153 +514,50 @@ class TensorRTEngine { } #endif - bool Has(const std::string& attr_name) const { - return attrs_.count(attr_name) > 0; - } - - void Erase(const std::string& attr_name) { - if (!Has(attr_name)) { - return; - } - if (attr_dels_.find(attr_name) != attr_dels_.end()) { - attr_dels_[attr_name](); - attr_dels_.erase(attr_name); - } - attrs_.erase(attr_name); - } - - // Set a pointer to the attribute. Engine takes ownership of the attribute. - template - void Set(const std::string& attr_name, AttrType* attr) { - if (attrs_.count(attr_name) == 0) { - PADDLE_ENFORCE_EQ( - attrs_.count(attr_name), - 0, - platform::errors::AlreadyExists( - "Attribute %s already set in trt engine.", attr_name)); - } else { - VLOG(3) << "Setting the attribute " << attr_name << " for trt engine " - << this; - } - attrs_[attr_name] = attr; - attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(3) << "deleting " << attr_name; - delete attr; - }; - } - - // Set a pointer to the attribute. Engine doesn't take ownership. Caller - // should delete the attribute. - template - void SetNotOwned(const std::string& attr_name, AttrType* attr) { - PADDLE_ENFORCE_EQ( - attrs_.count(attr_name), - 0, - platform::errors::AlreadyExists( - "Attribute %s already set in trt engine.", attr_name)); - attrs_[attr_name] = attr; - } - - // Get a reference to the attributed previously set. 
- template - AttrType& Get(const std::string& attr_name) const { - PADDLE_ENFORCE_NE(attrs_.find(attr_name), - attrs_.end(), - platform::errors::InvalidArgument( - "Attribute %s not found in trt engine.", attr_name)); - try { - return *paddle::any_cast(attrs_.at(attr_name)); - } catch (paddle::bad_any_cast&) { - auto TypeToString = [](const std::type_info& info) -> std::string { - if (std::type_index(info) == std::type_index(typeid(bool*))) { - return "bool"; - } else if (std::type_index(info) == std::type_index(typeid(int*))) { - return "int"; - } else if (std::type_index(info) == - std::type_index(typeid(const int*))) { - return "const int"; - } else if (std::type_index(info) == - std::type_index(typeid(std::string*))) { - return "std::string"; - } - return info.name(); - }; - - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid type for attritube %s, expected: %s, actual: %s.", - attr_name, - TypeToString(typeid(AttrType*)), - TypeToString(attrs_.at(attr_name).type()))); - } - } - void SetProfileNum(int num) { max_profile_num_ = num; } - void GetEngineInfo(); - - void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; } - void SetScope(const framework::Scope& scope) { scope_ = &scope; } - - void SetContextMemorySharing(bool context_memory_sharing) { - context_memory_sharing_ = context_memory_sharing; - } - - void SetLowPrecisionIO(bool low_precision_io) { - low_precision_io_ = low_precision_io; - } - - bool EnableLowPrecisionIO() const { return low_precision_io_; } + void SetScope(const framework::Scope* scope) { scope_ = scope; } void SetAllNodesLowerToTrt(bool all_nodes_offload_to_trt) { // all nodes are in trt, so we can use cudaGraph to optimize runtime. startup_with_cudagraph_ = all_nodes_offload_to_trt; } + bool LowPrecisionIOEnabled() const { return params_.enable_low_precision_io; } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling - // freshDeviceId(). - void freshDeviceId(); - // Used for convert weight into Itensor - const framework::Scope* scope_; + // FreshDeviceId(). + void FreshDeviceId(); - // the max batch size - int max_batch_; - // the runtime batch size - static int runtime_batch_; - // the max memory size the engine uses - int64_t max_workspace_; + void GetEngineInfo(); - phi::DataType precision_; - TRTInt8Calibrator* calibrator_; - // batch size of the current data, will be updated each Executation. - int batch_size_{-1}; + int device_id() { return params_.device_id; } - // use for engine context memory sharing - bool context_memory_sharing_{false}; + int GetProfileIndex() { + if (max_profile_num_ > 1) { + std::unique_lock lock(mutex_); + return profile_index_[predictor_id_per_thread]; + } else { + return 0; + } + } - bool low_precision_io_{false}; + private: + // + // Construction parameters. + // + ConstructionParams params_; + + // + // The following are runtime parameters. 
+ // - int device_id_; int max_profile_num_{1}; int cur_profile_num_{0}; std::unordered_map profile_index_; - bool with_dynamic_shape_{false}; - ShapeMapType min_input_shape_; - ShapeMapType max_input_shape_; - ShapeMapType optim_input_shape_; - ShapeMapType min_shape_tensor_; - ShapeMapType max_shape_tensor_; - ShapeMapType optim_shape_tensor_; - bool disable_trt_plugin_fp16_{false}; - phi::DataType model_precision_{phi::DataType::FLOAT32}; - bool use_varseqlen_{false}; - bool use_dla_{false}; - int dla_core_{0}; - bool with_ernie_{false}; - bool with_interleaved_{false}; - std::string tensorrt_transformer_posid_; - std::string tensorrt_transformer_maskid_; + nvinfer1::ILogger& logger_; // max data size for the buffers. @@ -828,8 +583,9 @@ class TensorRTEngine { bool cudagraph_inited_{false}; bool startup_with_cudagraph_{false}; - std::unordered_map attrs_; - std::unordered_map> attr_dels_; + // Used for convert weight into Itensor + const framework::Scope* scope_{nullptr}; + #if IS_TRT_VERSION_GE(6000) int binding_num_; infer_ptr infer_builder_config_; @@ -837,7 +593,6 @@ class TensorRTEngine { std::vector> owned_pluginv2_; #endif std::mutex mutex_; - bool use_inspector_; public: thread_local static int predictor_id_per_thread; @@ -872,7 +627,7 @@ class TRTEngineManager { bool Empty() const { std::lock_guard lock(mutex_); - return engines_.size() == 0; + return engines_.empty(); } bool Has(const std::string& name) const { @@ -886,41 +641,13 @@ class TRTEngineManager { return engines_.at(name).get(); } - TensorRTEngine* Create( - std::string name, - int max_batch, - int64_t max_workspace, - phi::DataType precision = phi::DataType::FLOAT32, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - bool with_dynamic_shape = false, - const std::map> min_input_shape = {}, - const std::map> max_input_shape = {}, - const std::map> optim_input_shape = {}, - const std::map> min_shape_tensor = {}, - const std::map> max_shape_tensor = {}, - const std::map> optim_shape_tensor = {}, - bool disable_trt_plugin_fp16 = false, - phi::DataType model_precision = phi::DataType::FLOAT32, - nvinfer1::ILogger& logger = NaiveLogger::Global()) { - auto* p = new TensorRTEngine(max_batch, - max_workspace, - precision, - calibrator, - device_id, - with_dynamic_shape, - min_input_shape, - max_input_shape, - optim_input_shape, - min_shape_tensor, - max_shape_tensor, - optim_shape_tensor, - disable_trt_plugin_fp16, - model_precision, - logger); + TensorRTEngine* Create(const std::string& name, + const TensorRTEngine::ConstructionParams& params, + nvinfer1::ILogger& logger = NaiveLogger::Global()) { + auto engine = std::make_unique(params, logger); std::lock_guard lock(mutex_); - engines_[name].reset(p); - return p; + engines_[name].reset(engine.release()); + return engines_[name].get(); } void DeleteAll() { @@ -940,7 +667,7 @@ class TRTEngineManager { } } - void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) { + void UpdateContextMemorySize(size_t mem_size, PredictorID predictor_id) { VLOG(3) << "TensorRT engine context memory size is " << mem_size / 1024.0 / 1024.0 << "MiB in predictor id " << predictor_id; @@ -955,24 +682,24 @@ class TRTEngineManager { } if (size_updated) { - releaseContextMemory(predictor_id); + ReleaseContextMemory(predictor_id); } } - void* getContextMemory(PredictorID predictor_id, + void* GetContextMemory(PredictorID predictor_id, const phi::GPUPlace& place, const phi::Stream& stream) { std::lock_guard lock(mutex_); - static auto alignment = getAlignmentSize(place); + 
static auto alignment = GetAlignmentSize(place); if (context_memorys_.count(predictor_id) == 0) { auto context_memory = memory::Alloc(place, max_ctx_mem_size_ + alignment, stream); context_memorys_[predictor_id] = std::move(context_memory); } - return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); + return GetAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); } - void releaseContextMemory(PredictorID predictor_id) { + void ReleaseContextMemory(PredictorID predictor_id) { std::lock_guard lock(mutex_); if (context_memorys_.count(predictor_id)) { context_memorys_[predictor_id].reset(nullptr); @@ -981,12 +708,12 @@ class TRTEngineManager { } private: - size_t getAlignmentSize(const phi::GPUPlace& place) { + size_t GetAlignmentSize(const phi::GPUPlace& place) { const auto& prop = platform::GetDeviceProperties(place.GetDeviceId()); return prop.textureAlignment; } - void* getAlignedMemory(void* addr, size_t alignment) { + void* GetAlignedMemory(void* addr, size_t alignment) { return reinterpret_cast(uintptr_t(addr) & (~(alignment - 1))); } diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index e744b261cbcfada9d4321ba4c412f8938be190f9..3f0152b2909c0826d3c41daddd5af96ab1fd3b95 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -148,7 +148,7 @@ class NaiveProfiler : public nvinfer1::IProfiler { typedef std::pair Record; std::vector mProfile; - virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT { + void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT override { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r) { return r.first == layerName; @@ -235,6 +235,130 @@ static inline nvinfer1::DataType PhiType2NvType(phi::DataType type) { return nv_type; } +using FluidDT = paddle::framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; +static TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + case FluidDT::VarType_Type_FP64: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + case FluidDT::VarType_Type_INT64: + return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; +#if IS_TRT_VERSION_GE(8400) + case FluidDT::VarType_Type_BOOL: + return TRT_DT::kBOOL; + +#endif + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "unsupported datatype in TRT op converter, type: %s. " + "Boolean type is supported as TRT input/output " + "using TensorRT v8.4+.", + VarType_Type_Name(type))); + } + return TRT_DT::kINT32; +} + +// The T can be int32 or int64 type. 
+template +static nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, + std::string input, + bool with_dynamic_shape = false) { + PADDLE_ENFORCE_GE(shape.size(), + 0UL, + paddle::platform::errors::InvalidArgument( + "TensorRT's tensor input requires at least 0 " + "dimensions, but input %s has %d dims.", + input, + shape.size())); + + auto ShapeStr = [](const std::vector& shape) { + std::ostringstream os; + os << "["; + for (size_t i = 0; i < shape.size(); ++i) { + if (i == shape.size() - 1) { + os << shape[i]; + } else { + os << shape[i] << ","; + } + } + os << "]"; + return os.str(); + }; + if (!with_dynamic_shape) { + if (shape.size() == 4UL) { + if (shape[2] == -1 || shape[3] == -1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims3(shape[1], shape[2], shape[3]); + } else if (shape.size() == 5UL) { + if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]); + } else if (shape.size() == 3UL) { + if (shape[1] == -1 || shape[2] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims2(shape[1], shape[2]); + } else if (shape.size() == 2UL) { + if (shape[1] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + nvinfer1::Dims dims; + dims.nbDims = 1; + dims.d[0] = shape[1]; + return dims; + } + // static shape doesn't support 1D op so far. + PADDLE_ENFORCE_NE(shape.size(), + 1UL, + paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s." + "it's not supported by trt so far", + input, + ShapeStr(shape))); + + nvinfer1::Dims dims; + dims.nbDims = shape.size() - 1; + for (size_t i = 1; i < shape.size(); i++) { + dims.d[i - 1] = shape[i]; + } + return dims; + } else { + if (shape.size() == 4UL) { + return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); + } else if (shape.size() == 3UL) { + return nvinfer1::Dims3(shape[0], shape[1], shape[2]); + } + nvinfer1::Dims dims; + dims.nbDims = shape.size(); + for (size_t i = 0; i < shape.size(); i++) { + dims.d[i] = shape[i]; + } + return dims; + } +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index fcc4f2bfcf7e245dccb6883e7dd834e6459271ea..8d2eb4cb4919b4372be3787fcddb05524afe7ac9 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/layout.h" #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif @@ -35,7 +36,7 @@ namespace tensorrt { class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -65,29 +66,21 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { {"shape", {18, 8, 4}}}; std::map> optim_input_value = { {"shape", {18, 8, 4}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - min_input_value, - max_input_value, - optim_input_value, - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; + params.min_shape_tensor = min_input_value; + params.max_shape_tensor = max_input_value; + params.optim_shape_tensor = optim_input_value; + + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -106,8 +99,8 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { phi::DenseTensor input_; phi::DenseTensor shape_; phi::DenseTensor output_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { @@ -167,7 +160,7 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -192,29 +185,18 @@ class TensorRTDynamicEngineTest : public ::testing::Test { std::map> optim_input_shape = { {"input", {16, 32, 1, 1}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT16, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.with_dynamic_shape = true; + params.precision = phi::DataType::FLOAT16; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + 
engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -230,8 +212,8 @@ class TensorRTDynamicEngineTest : public ::testing::Test { protected: phi::DenseTensor input_; phi::DenseTensor output_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicEngineTest, test_spmm) { @@ -336,7 +318,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -370,29 +352,18 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { {"mask", {4, 1, 4, 4}}, {"new_mask", {4, 1, 2, 2}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::FLOAT32; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector> inputs, @@ -419,13 +390,12 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: std::vector inputs_; std::vector outputs_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { #if IS_TRT_VERSION_GE(8000) - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); auto *attn = engine_->DeclareInput( "attn", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 4}); auto *x = engine_->DeclareInput( @@ -545,7 +515,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -579,29 +549,17 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { {"mask", {4, 1, 4, 4}}, {"new_mask", {4, 1, 2, 2}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT16, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::FLOAT16; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override 
{ - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector> inputs, @@ -628,13 +586,12 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { protected: std::vector inputs_; std::vector outputs_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { #if IS_TRT_VERSION_GE(8000) - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); auto *attn = engine_->DeclareInput( "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims2{-1, 4}); auto *x = engine_->DeclareInput( @@ -754,7 +711,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { class TensorRTDynamicShapeGNTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -782,29 +739,18 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { std::map> max_input_value = {}; std::map> optim_input_value = {}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::INT8, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - min_input_value, - max_input_value, - optim_input_value, - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::INT8; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -923,8 +869,8 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { protected: phi::DenseTensor x_; phi::DenseTensor y_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; // case from SD int n_ = 2; int c_ = 320; @@ -942,8 +888,6 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { /* TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) { - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); - float *bias = new float[c_]; float *scale = new float[c_]; for (int i = 0; i < c_; i++) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 23a0df7316724a9ab6eece77a73a42e1199839d4..256af16fb155fc57313c7f5a84c9800bab09ec93 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -48,17 +49,13 @@ class TensorRTEngineTest : public ::testing::Test { .get()); ctx_->PartialInitWithAllocator(); - engine_ = new TensorRTEngine(10, 1 << 10); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 10; + params.max_workspace_size = 1 << 10; + engine_ = std::make_unique(params); engine_->InitNetwork(); } - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } - } - void PrepareInputOutput(const std::vector &input, std::vector output_shape) { paddle::framework::TensorFromVector(input, *ctx_, &input_); @@ -72,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { protected: phi::DenseTensor input_; phi::DenseTensor output_; - TensorRTEngine *engine_; + std::unique_ptr engine_; phi::GPUContext *ctx_; }; @@ -111,15 +108,6 @@ TEST_F(TensorRTEngineTest, add_layer) { buffers[0] = reinterpret_cast(x_v_gpu_data); buffers[1] = reinterpret_cast(y_gpu_data); - LOG(INFO) << "Set attr"; - engine_->Set("test_attr", new std::string("test_attr")); - if (engine_->Has("test_attr")) { - auto attr_val = engine_->Get("test_attr"); - engine_->Erase("test_attr"); - } - std::string *attr_key = new std::string("attr_key"); - engine_->SetNotOwned("attr1", attr_key); - LOG(INFO) << "to execute"; engine_->Execute(1, &buffers, ctx_->stream()); @@ -128,8 +116,6 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); - - delete attr_key; } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 0c80316f0003c7d5f39a68fad530587f5781c777..82bb7a64168b4baa4efd73c4e486b78a258f6c83 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -43,7 +43,7 @@ class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { const platform::Place place); explicit TRTInt8Calibrator(const std::string& calibration_data); - ~TRTInt8Calibrator(); + ~TRTInt8Calibrator() override; int getBatchSize() const TRT_NOEXCEPT override; @@ -91,7 +91,7 @@ class TRTCalibratorEngine { */ class TRTCalibratorEngineManager { public: - bool Has() const { return res_.size() > 0; } + bool Has() const { return !res_.empty(); } bool Has(const std::string& name) const { if (res_.count(name) == 0) return false; return res_.at(name).get() != nullptr; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 395ec613db8f4af83665bd7c70510506e48f28d6..3ebf16410fe6cd9d929801d422814b5f5c0f50a8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -14,30 +14,34 @@ #pragma once -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/data_type_transform.h" #ifdef PADDLE_WITH_CUDA +#include #include #include #include #include #include #include -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include 
"paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/utils/string/string_helper.h" namespace paddle { @@ -171,10 +175,6 @@ class TensorRTEngineOp : public framework::OperatorBase { std::string model_opt_cache_dir_; bool use_static_engine_; phi::DataType precision_mode_; - std::map> min_input_shape_{}; - std::map> max_input_shape_{}; - std::map> opt_input_shape_{}; - phi::DataType model_precision_{phi::DataType::FLOAT32}; public: TensorRTEngineOp(const std::string &type, @@ -185,7 +185,7 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); - device_id_ = Attr("gpu_id"); + device_id_ = Attr("gpu_device_id"); enable_int8_ = Attr("enable_int8"); enable_fp16_ = Attr("enable_fp16"); use_calib_mode_ = Attr("use_calib_mode"); @@ -200,43 +200,6 @@ class TensorRTEngineOp : public framework::OperatorBase { if (use_static_engine_) { model_opt_cache_dir_ = Attr("model_opt_cache_dir"); } - model_precision_ = static_cast(Attr("model_precision")); - - if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") && - HasAttr("max_input_shape") && HasAttr("opt_input_shape")) { - std::vector dynamic_shape_names; - std::vector> min_input_shapes; - std::vector> max_input_shapes; - std::vector> opt_input_shapes; - std::vector dynamic_shape_lens; - dynamic_shape_names = - Attr>("dynamic_shape_names"); - std::vector min_shapes = Attr>("min_input_shape"); - std::vector max_shapes = Attr>("max_input_shape"); - std::vector opt_shapes = Attr>("opt_input_shape"); - dynamic_shape_lens = Attr>("dynamic_shape_lens"); - int idx = 0; - for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) { - std::vector tmp1, tmp2, tmp3; - for (int j = 0; j < dynamic_shape_lens[i]; ++j) { - tmp1.push_back(min_shapes[idx]); - tmp2.push_back(max_shapes[idx]); - tmp3.push_back(opt_shapes[idx++]); - } - min_input_shapes.emplace_back(tmp1); - max_input_shapes.emplace_back(tmp2); - opt_input_shapes.emplace_back(tmp3); - } - - for (size_t i = 0; i < dynamic_shape_names.size(); ++i) { - min_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], min_input_shapes[i])); - max_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], max_input_shapes[i])); - opt_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], opt_input_shapes[i])); - } - } auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -249,11 +212,11 @@ class TensorRTEngineOp : public framework::OperatorBase { // calibration_mode is true represents we need to // generate the calibration table data. 
calibration_mode_ = - (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_); + (enable_int8_ && calibration_data_.empty() && use_calib_mode_); VLOG(4) << "calibration_mode: " << calibration_mode_; - if (enable_int8_ && calibration_data_.size()) { - calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + if (enable_int8_ && !calibration_data_.empty()) { + calibrator_ = std::make_unique(calibration_data_); } bool has_engine = inference::Singleton::Global() @@ -486,36 +449,29 @@ class TensorRTEngineOp : public framework::OperatorBase { auto t_shape = phi::vectorize(t.dims()); runtime_batch = t_shape[0]; } - calib_res->calib_.reset(new TRTInt8Calibrator( - calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); + calib_res->calib_ = std::make_unique( + calib_buffers, runtime_batch, calibration_engine_key_, dev_place); calib_res->thr_.reset(new std::thread([&]() { - std::map> min_input_shape; - std::map> max_input_shape; - std::map> opt_input_shape; - std::map> min_shape_tensor; - std::map> max_shape_tensor; - std::map> opt_shape_tensor; - if (shape_range_info_path_.size()) + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size_; + params.max_workspace_size = workspace_size_; + params.precision = precision_mode_; + params.calibrator = calib_res->calib_.get(); + params.device_id = dev_place.device; + params.with_dynamic_shape = with_dynamic_shape_; + if (!shape_range_info_path_.empty()) { inference::DeserializeShapeRangeInfo(shape_range_info_path_, - &min_input_shape, - &max_input_shape, - &opt_input_shape, - &min_shape_tensor, - &max_shape_tensor, - &opt_shape_tensor); - - calib_res->engine_.reset(new TensorRTEngine(max_batch_size_, - workspace_size_, - precision_mode_, - calib_res->calib_.get(), - dev_place.device, - with_dynamic_shape_, - min_input_shape, - max_input_shape, - opt_input_shape, - min_shape_tensor, - max_shape_tensor, - opt_shape_tensor)); + ¶ms.min_input_shape, + ¶ms.max_input_shape, + ¶ms.optim_input_shape, + ¶ms.min_shape_tensor, + ¶ms.max_shape_tensor, + ¶ms.optim_shape_tensor); + } + params.context_memory_sharing = Attr("context_memory_sharing"); + params.enable_low_precision_io = Attr("enable_low_precision_io"); + calib_res->engine_ = std::make_unique(params); + VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); @@ -597,7 +553,7 @@ class TensorRTEngineOp : public framework::OperatorBase { // This must be a zero dimension tensor. // At present, we convert it to a 1D tensor to feed them into Trt. 
- if (t_shape.size() == 0) { + if (t_shape.empty()) { PADDLE_ENFORCE_EQ( t.numel(), 1UL, @@ -669,8 +625,12 @@ class TensorRTEngineOp : public framework::OperatorBase { t.numel() * sizeof(int), nullptr); } else if (t.dtype() == phi::DataType::INT64) { - auto int32_tensor = scope.FindVar(x + "_cast_to_INT32") - ->GetMutable(); + std::string x_t = x + "_cast_to_INT32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto int32_tensor = + scope.FindVar(x_t)->GetMutable(); *int32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, @@ -703,16 +663,22 @@ class TensorRTEngineOp : public framework::OperatorBase { if (t.dtype() == phi::DataType::FLOAT32) { buffers[bind_index] = static_cast(t.data()); } else if (t.dtype() == phi::DataType::FLOAT64) { - auto fp32_tensor = - scope.FindVar(x + "_cast_to_FP32")->GetMutable(); + std::string x_t = x + "_cast_to_FP32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto fp32_tensor = scope.FindVar(x_t)->GetMutable(); *fp32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, phi::DataType::FLOAT32); buffers[bind_index] = static_cast(fp32_tensor->data()); } else if (t.dtype() == phi::DataType::INT64) { - auto int32_tensor = - scope.FindVar(x + "_cast_to_INT32")->GetMutable(); + std::string x_t = x + "_cast_to_INT32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto int32_tensor = scope.FindVar(x_t)->GetMutable(); *int32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, @@ -827,8 +793,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto y = Outputs("Ys")[i]; auto *fluid_v = scope.FindVar(y); auto *fluid_t = fluid_v->GetMutable(); - auto int32_tensor = - scope.FindVar(y + "_cast_to_INT64")->GetMutable(); + std::string y_t = y + "_cast_to_INT64"; + if (scope.FindVar(y_t) == nullptr) { + const_cast(&scope)->Var(y_t); + } + auto int32_tensor = scope.FindVar(y_t)->GetMutable(); int32_tensor->Resize(fluid_t->dims()); dev_ctx.Alloc(int32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor); @@ -840,8 +809,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto y = Outputs("Ys")[i]; auto *fluid_v = scope.FindVar(y); auto *fluid_t = fluid_v->GetMutable(); - auto fp32_tensor = - scope.FindVar(y + "_cast_to_FP64")->GetMutable(); + std::string y_t = y + "_cast_to_FP64"; + if (scope.FindVar(y_t) == nullptr) { + const_cast(&scope)->Var(y_t); + } + auto fp32_tensor = scope.FindVar(y_t)->GetMutable(); fp32_tensor->Resize(fluid_t->dims()); dev_ctx.Alloc(fp32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, fp32_tensor); @@ -856,20 +828,92 @@ class TensorRTEngineOp : public framework::OperatorBase { TensorRTEngine *GetEngine(const framework::Scope &scope, const platform::Place &dev_place) const { if (!trt_engine_) { + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size_; + params.max_workspace_size = workspace_size_; + params.precision = precision_mode_; + params.calibrator = calibrator_.get(); + params.device_id = dev_place.device; + params.with_dynamic_shape = with_dynamic_shape_; + params.context_memory_sharing = Attr("context_memory_sharing"); + params.use_dla = Attr("use_dla"); + params.dla_core = Attr("dla_core"); + params.disable_trt_plugin_fp16 = Attr("disable_trt_plugin_fp16"); + params.enable_low_precision_io = Attr("enable_low_precision_io"); + params.use_inspector = Attr("use_inspector"); + + if (!shape_range_info_path_.empty()) { + 
inference::DeserializeShapeRangeInfo(shape_range_info_path_, + ¶ms.min_input_shape, + ¶ms.max_input_shape, + ¶ms.optim_input_shape, + ¶ms.min_shape_tensor, + ¶ms.max_shape_tensor, + ¶ms.optim_shape_tensor); + } else { + if (HasAttr("dynamic_shape_names") && + HasAttr("min_input_shape_vector") && + HasAttr("max_input_shape_vector") && + HasAttr("opt_input_shape_vector")) { + std::vector dynamic_shape_names; + std::vector> min_input_shapes; + std::vector> max_input_shapes; + std::vector> opt_input_shapes; + std::vector dynamic_shape_lens; + dynamic_shape_names = + Attr>("dynamic_shape_names"); + std::vector min_shapes = + Attr>("min_input_shape_vector"); + std::vector max_shapes = + Attr>("max_input_shape_vector"); + std::vector opt_shapes = + Attr>("opt_input_shape_vector"); + dynamic_shape_lens = Attr>("dynamic_shape_lens"); + int idx = 0; + for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) { + std::vector tmp1, tmp2, tmp3; + for (int j = 0; j < dynamic_shape_lens[i]; ++j) { + tmp1.push_back(min_shapes[idx]); + tmp2.push_back(max_shapes[idx]); + tmp3.push_back(opt_shapes[idx++]); + } + min_input_shapes.emplace_back(tmp1); + max_input_shapes.emplace_back(tmp2); + opt_input_shapes.emplace_back(tmp3); + } + + for (size_t i = 0; i < dynamic_shape_names.size(); ++i) { + params.min_input_shape.insert( + std::make_pair(dynamic_shape_names[i], min_input_shapes[i])); + params.max_input_shape.insert( + std::make_pair(dynamic_shape_names[i], max_input_shapes[i])); + params.optim_input_shape.insert( + std::make_pair(dynamic_shape_names[i], opt_input_shapes[i])); + } + } + } + trt_engine_ = inference::Singleton::Global() - .Create(engine_key_ + std::to_string(predictor_id_), - max_batch_size_, - workspace_size_, - precision_mode_, - calibrator_.get(), - device_id_, - with_dynamic_shape_, - min_input_shape_, - max_input_shape_, - opt_input_shape_); - PrepareTRTEngine(scope, trt_engine_); + .Create(engine_key_ + std::to_string(predictor_id_), params); + + if (use_static_engine_) { + LOG(INFO) << "Load TRT Optimized Info from " + << inference::analysis::GetTrtEngineSerializedPath( + model_opt_cache_dir_, engine_key_); + std::string trt_engine_serialized_data = + inference::analysis::GetTrtEngineSerializedData( + model_opt_cache_dir_, engine_key_); + trt_engine_->Deserialize(trt_engine_serialized_data); + } else { + // This brach mainly used to ut. 
+ PrepareTRTEngine(scope, trt_engine_); + } } + PADDLE_ENFORCE_NOT_NULL( + trt_engine_, + platform::errors::Fatal( + "The pointer to tensorrt engine should not be null.")); return trt_engine_; } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 3d96361d89f043da5a6acff7ad1ed96e20bac583..f6c20061de001076b82a240fe10e463871bf82a3 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -113,7 +113,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ASSERT_EQ(block_->ops_size(), 2); - LOG(INFO) << "create tensorrt desc"; + LOG(INFO) << "create tensorrt op desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); @@ -138,19 +138,18 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; - engine_op_desc.SetAttr("gpu_id", device_id); + engine_op_desc.SetAttr("gpu_device_id", device_id); engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime); - engine_op_desc.SetAttr("use_static_engine", true); - engine_op_desc.SetAttr("dynamic_shape_names", std::vector{"x"}); - engine_op_desc.SetAttr("dynamic_shape_lens", std::vector{4}); - engine_op_desc.SetAttr("with_dynamic_shape", true); - engine_op_desc.SetAttr("min_input_shape", std::vector{1, 1, 1, 1}); - engine_op_desc.SetAttr("max_input_shape", std::vector{16, 16, 16, 16}); - engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 4, 4}); - engine_op_desc.SetAttr("model_precision", - static_cast(phi::DataType::FLOAT32)); + engine_op_desc.SetAttr("use_static_engine", false); + engine_op_desc.SetAttr("with_dynamic_shape", false); + engine_op_desc.SetAttr("context_memory_sharing", true); + engine_op_desc.SetAttr("disable_trt_plugin_fp16", false); + engine_op_desc.SetAttr("enable_low_precision_io", false); + engine_op_desc.SetAttr("use_inspector", false); + engine_op_desc.SetAttr("use_dla", false); + engine_op_desc.SetAttr("dla_core", 0); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -263,7 +262,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { {output_dim, output_dim}, {batch_size, output_dim}); - LOG(INFO) << "create tensorrt desc"; + LOG(INFO) << "create tensorrt op desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x0"})); @@ -288,11 +287,18 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; - engine_op_desc.SetAttr("gpu_id", device_id); + engine_op_desc.SetAttr("gpu_device_id", device_id); engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("allow_build_at_runtime", false); engine_op_desc.SetAttr("use_static_engine", false); + engine_op_desc.SetAttr("with_dynamic_shape", false); + 
engine_op_desc.SetAttr("context_memory_sharing", true); + engine_op_desc.SetAttr("disable_trt_plugin_fp16", false); + engine_op_desc.SetAttr("enable_low_precision_io", false); + engine_op_desc.SetAttr("use_inspector", false); + engine_op_desc.SetAttr("use_dla", false); + engine_op_desc.SetAttr("dla_core", 0); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 1e7fcfa6a100508d64660bbb78184ea89f2ea13e..d010e9545758c10f4de6b1e026b53b9419bcb29b 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -174,6 +174,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 300) + set_tests_properties(test_save_optimized_model_pass PROPERTIES TIMEOUT 300) if(WITH_NV_JETSON) set_tests_properties( diff --git a/test/ir/inference/test_save_optimized_model_pass.py b/test/ir/inference/test_save_optimized_model_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5a52285a5ee14e4acfb9793b87f039dc060778 --- /dev/null +++ b/test/ir/inference/test_save_optimized_model_pass.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle.inference import Config, PrecisionType, create_predictor +from paddle.jit import to_static +from paddle.static import InputSpec +from paddle.vision.models import alexnet + + +class TestSaveOptimizedModelPass: + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + net = alexnet(True) + model = to_static( + net, input_spec=[InputSpec(shape=[None, 3, 224, 224], name='x')] + ) + paddle.jit.save( + model, os.path.join(self.temp_dir.name, 'alexnet/inference') + ) + + def tearDown(self): + self.temp_dir.cleanup() + + def get_baseline(self): + predictor = self.init_predictor(save_optimized_model=True) + + inputs = [ + paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32)) + ] + + outputs = predictor.run(inputs) + + return outputs[0] + + def get_test_output(self): + predictor = self.init_predictor(save_optimized_model=False) + + inputs = [ + paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32)) + ] + + outputs = predictor.run(inputs) + + return outputs[0] + + def test_output(self): + if paddle.is_compiled_with_cuda(): + baseline = self.get_baseline() + test_output = self.get_test_output() + + np.testing.assert_allclose( + baseline.numpy().flatten(), + test_output.numpy().flatten(), + ) + + +class TestSaveOptimizedModelPassWithGPU( + TestSaveOptimizedModelPass, unittest.TestCase +): + def init_predictor(self, save_optimized_model: bool): + if save_optimized_model is True: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'), + os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'), + ) + config.enable_use_gpu(256, 0, PrecisionType.Half) + config.enable_memory_optim() + config.switch_ir_optim(True) + config.set_optim_cache_dir( + os.path.join(self.temp_dir.name, 'alexnet') + ) + config.enable_save_optim_model(True) + else: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'), + os.path.join( + self.temp_dir.name, 'alexnet/_optimized.pdiparams' + ), + ) + config.enable_use_gpu(256, 0, PrecisionType.Half) + config.enable_memory_optim() + config.switch_ir_optim(False) + + predictor = create_predictor(config) + return predictor + + +class TestSaveOptimizedModelPassWithTRT( + TestSaveOptimizedModelPass, unittest.TestCase +): + def init_predictor(self, save_optimized_model: bool): + if save_optimized_model is True: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'), + os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'), + ) + config.enable_use_gpu(256, 0) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + max_batch_size=1, + min_subgraph_size=3, + precision_mode=PrecisionType.Half, + use_static=True, + use_calib_mode=False, + ) + config.set_trt_dynamic_shape_info( + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + ) + config.exp_disable_tensorrt_ops(["flatten_contiguous_range"]) + config.enable_memory_optim() + config.switch_ir_optim(True) + config.set_optim_cache_dir( + os.path.join(self.temp_dir.name, 'alexnet') + ) + config.enable_save_optim_model(True) + else: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'), + os.path.join( + self.temp_dir.name, 'alexnet/_optimized.pdiparams' + ), + ) + config.enable_use_gpu(256, 0) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + 
max_batch_size=1, + min_subgraph_size=3, + precision_mode=PrecisionType.Half, + use_static=True, + use_calib_mode=False, + ) + config.enable_memory_optim() + config.switch_ir_optim(False) + + predictor = create_predictor(config) + return predictor + + +if __name__ == '__main__': + unittest.main()
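Appended for context: a minimal usage sketch (not part of the patch) of the construction path after this refactor. The long positional TensorRTEngine constructor and the engine attribute map (Set/Get/SetNotOwned/Erase) are removed above; callers now fill a TensorRTEngine::ConstructionParams struct and either build the engine directly, as the updated unit tests do, or go through TRTEngineManager::Create(name, params). The input name "x", the concrete shape values, and the engine key below are illustrative assumptions, not values taken from the patch.

#include <memory>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/phi/common/data_type.h"

namespace paddle {
namespace inference {
namespace tensorrt {

// Sketch only: field names follow this patch; shapes are illustrative.
std::unique_ptr<TensorRTEngine> BuildEngineSketch() {
  TensorRTEngine::ConstructionParams params;
  params.max_batch_size = 16;           // was the first positional ctor argument
  params.max_workspace_size = 1 << 30;  // was the second positional ctor argument
  params.precision = phi::DataType::FLOAT16;
  params.with_dynamic_shape = true;
  params.min_input_shape = {{"x", {1, 3, 224, 224}}};
  params.max_input_shape = {{"x", {8, 3, 224, 224}}};
  params.optim_input_shape = {{"x", {4, 3, 224, 224}}};

  // Direct construction, mirroring the updated unit tests.
  auto engine = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
  engine->InitNetwork();
  return engine;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

// The engine op instead registers the engine with the global manager, which now
// accepts the same struct (key shown here is illustrative):
//   auto *trt_engine =
//       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
//           .Create("engine_key_0", params);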
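Similarly, a small sketch (also not part of the patch) of the FluidDataType2TRT and Vec2TRT_Dims helpers that this change adds to helper.h. The wrapper function name, the "x" input name, and the shape values are assumptions made for illustration.

#include <vector>

#include "paddle/fluid/inference/tensorrt/helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

void HelperSketch() {
  // FP16 fluid tensors map to TensorRT kHALF; INT64 is narrowed to kINT32.
  nvinfer1::DataType dt = FluidDataType2TRT(FluidDT::VarType_Type_FP16);

  // Static-shape mode drops the batch dimension: {N, C, H, W} becomes Dims3{C, H, W}.
  nvinfer1::Dims static_dims =
      Vec2TRT_Dims(std::vector<int64_t>{1, 3, 224, 224}, "x");

  // Dynamic-shape mode keeps every dimension, with -1 acting as a wildcard.
  nvinfer1::Dims dynamic_dims = Vec2TRT_Dims(
      std::vector<int64_t>{-1, 3, 224, 224}, "x", /*with_dynamic_shape=*/true);

  (void)dt;
  (void)static_dims;
  (void)dynamic_dims;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle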