Unverified commit 6b10c0e5, authored by Yuanle Liu, committed by GitHub

[Inference] save_optimized_model_pass support tensorrt (#55893)

* fix cudnn 8.7+ bug on cudnnConvolutionBiasActivationForward

* save_optimized_model_pass support tensorrt

* update

* update

* fix compile

* update

* fix ut timeout
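
With this change the save-optimized-model pass no longer skips configs that use the TensorRT engine. Below is a minimal, hypothetical C++ usage sketch (not part of this commit) of how the saved optimized model might be produced from the inference API; the Config methods (EnableSaveOptimModel, SetOptimCacheDir, EnableTensorRtEngine, SwitchIrOptim, EnableUseGpu) are assumed from the existing paddle_infer::Config interface, and the model and cache paths are placeholders.

// Hypothetical sketch: save the IR-optimized model while TensorRT is enabled.
// Assumes the public paddle_infer::Config API; paths below are placeholders.
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(256 /*initial GPU memory in MB*/, 0 /*device id*/);
  config.EnableTensorRtEngine(1 << 30 /*workspace bytes*/,
                              1 /*max batch*/,
                              3 /*min subgraph size*/,
                              paddle_infer::Config::Precision::kFloat32,
                              false /*use_static*/,
                              false /*use_calib_mode*/);
  // Both switches must be on: RunImpl() now returns early only when
  // save_optimized_model or ir_optim is disabled.
  config.SwitchIrOptim(true);
  config.EnableSaveOptimModel(true);
  config.SetOptimCacheDir("./optim_cache");  // where the optimized model is written
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}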
Parent 68b0cf92
@@ -249,7 +249,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
    subgraphes_[i] = graph->GetSubGraph(i);
    all_op_nodes_[i] = TopologySortOperations(*subgraphes_[i]);
    VLOG(4) << "subgraph " << i << " has " << all_op_nodes_[i].size()
-           << "op nodes";
+           << " op nodes";
    for (auto* var_node : subgraphes_[i]->Nodes()) {
      if (!var_node->IsVar()) continue;
...
@@ -64,10 +64,6 @@ void NaiveExecutor::Run() {
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
-                                platform::NvtxRangeColor::Green);
-#endif
    for (auto &func : input_hookfuncs_) {
      func(op.get(), scope_);
@@ -77,7 +73,14 @@ void NaiveExecutor::Run() {
      op->SetOutputHooks(output_hookfuncs_);
    }
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
+                                platform::NvtxRangeColor::Green);
+#endif
    op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePop();
+#endif
    // Update the shared_holder so that only records the max one.
    if (reuse_cache_.count(op.get())) {
@@ -105,9 +108,6 @@ void NaiveExecutor::Run() {
      }
    }
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePop();
-#endif
    for (auto &func : output_hookfuncs_) {
      func(op.get(), scope_);
    }
...
@@ -14,8 +14,10 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include <fcntl.h>
#include <cstddef>
+#include <memory>
#include <string>
#include <unordered_set>
@@ -32,6 +34,7 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
@@ -124,11 +127,6 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
    framework::ir::Graph *graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph);
-  static std::once_flag trt_plugin_registered;
-  std::call_once(trt_plugin_registered, []() {
-    tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
-  });
  auto model_precision =
      static_cast<phi::DataType>(Get<int>("model_precision"));
  if (model_precision == phi::DataType::BFLOAT16) {
@@ -291,7 +289,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
  // Add new block for TensorRTEngineOP
  const framework::BlockDesc &main_block =
      program_desc->Block(framework::kRootBlockIndex);
-  // const framework::BlockDesc& main_block = program_desc->Block(0);
  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
  // A fake block desc.
@@ -319,9 +316,9 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
  // is unique.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
-  std::vector<std::string> params;
-  // if we delete fluid copy of params shared by more than 1 ops, there will be
-  // problem, so we filter them out.
+  std::vector<std::string> parameters;
+  // if we delete fluid copy of parameters shared by more than 1 ops, there will
+  // be problem, so we filter them out.
  std::vector<std::string> params_not_shared;
  auto *scope = param_scope();
@@ -330,7 +327,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
-      params.push_back(x->Name());
+      parameters.push_back(x->Name());
    }
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
        x->outputs.size() <= 1) {
@@ -340,33 +337,15 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
    // So we reserved a name for later use when casting INT64 -> INT32 or
    // FP64->FP32. We must check whether scope has had the same name var!
    if (x->Var()->GetDataType() == framework::proto::VarType::INT64) {
-      std::string tmp_name = x->Name() + "_cast_to_INT32";
      LOG(WARNING)
          << "tensorrt_subgraph's input named " << x->Name()
          << " having int64 dtype in pdmodel description, we will cast them to "
             "int32 dtype to feed them into paddle-trt.";
-      /*
-      PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name),
-                        nullptr,
-                        platform::errors::InvalidArgument(
-                            "The var name %s has exists in scope.",
-                            tmp_name));
-      */
-      scope->Var(tmp_name);
    } else if (x->Var()->GetDataType() == framework::proto::VarType::FP64) {
-      std::string tmp_name = x->Name() + "_cast_to_FP32";
      LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name()
                   << " having float64 dtype in pdmodel description, we will "
                      "cast them to "
                      "float32 dtype to feed them into paddle-trt.";
-      /*
-      PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name),
-                        nullptr,
-                        platform::errors::InvalidArgument(
-                            "The var name %s has exists in scope.",
-                            tmp_name));
-      */
-      scope->Var(tmp_name);
    }
  }
@@ -412,10 +391,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
      graph_var_map[node->Name()] = node;
    }
  }
-  auto precision_mode = Get<int>("trt_precision_mode");
+  auto precision_mode =
+      static_cast<phi::DataType>(Get<int>("trt_precision_mode"));
  bool enable_fp16 = false;
-  if (precision_mode == static_cast<int>(phi::DataType::FLOAT16))
-    enable_fp16 = true;
+  if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true;
  auto enable_int8 = Get<bool>("enable_int8");
  auto use_calib_mode = Get<bool>("use_calib_mode");
  auto &subgraph_nodes = *framework::ir::Agent(node).subgraph();
@@ -423,14 +402,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
      Get<std::map<std::string, std::vector<int>>>("min_input_shape");
  auto max_input_shape =
      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
-  auto opt_input_shape =
+  auto optim_input_shape =
      Get<std::map<std::string, std::vector<int>>>("optim_input_shape");
  auto min_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("min_shape_tensor");
  auto max_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("max_shape_tensor");
-  auto opt_shape_tensor =
+  auto optim_shape_tensor =
      Get<std::map<std::string, std::vector<int>>>("optim_shape_tensor");
  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
@@ -444,10 +423,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
      inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                           &min_input_shape,
                                           &max_input_shape,
-                                           &opt_input_shape,
+                                           &optim_input_shape,
                                           &min_shape_tensor,
                                           &max_shape_tensor,
-                                           &opt_shape_tensor);
+                                           &optim_shape_tensor);
    } else {
      shape_range_info_path =
          Get<std::string>("model_opt_cache_dir") + "shape_range_info.pbtxt";
@@ -457,10 +436,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
        inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                             &min_input_shape,
                                             &max_input_shape,
-                                             &opt_input_shape,
+                                             &optim_input_shape,
                                             &min_shape_tensor,
                                             &max_shape_tensor,
-                                             &opt_shape_tensor);
+                                             &optim_shape_tensor);
      } else {
        int fd = open(shape_range_info_path.c_str(), O_WRONLY | O_CREAT, 0644);
        close(fd);
@@ -509,32 +488,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
    if (static_cast<framework::proto::VarType_Type>(
            map_origin_outputs_dtype[name]) ==
        framework::proto::VarType::INT64) {
-      std::string tmp_name = name + "_cast_to_INT64";
      LOG(WARNING) << "tensorrt_subgraph's output named " << name
                   << " having int64 dtype in pdmodel description, but in fact "
                      "it is int32 "
                      "dtype after executing this tensorrt_subgraph, so we "
                      "need cast them into int64.";
-      PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name),
-                        nullptr,
-                        platform::errors::InvalidArgument(
-                            "The var name %s has exists in scope.", tmp_name));
-      scope->Var(tmp_name);
    } else if (static_cast<framework::proto::VarType_Type>(
                   map_origin_outputs_dtype[name]) ==
               framework::proto::VarType::FP64) {
-      std::string tmp_name = name + "_cast_to_FP64";
      LOG(WARNING)
          << "tensorrt_subgraph's output named " << name
          << " having float64 dtype in pdmodel description, but in fact "
             "it is float32 "
             "dtype after executing this tensorrt_subgraph, so we "
             "need cast them into float64.";
-      PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name),
-                        nullptr,
-                        platform::errors::InvalidArgument(
-                            "The var name %s has exists in scope.", tmp_name));
-      scope->Var(tmp_name);
    }
  }
  PADDLE_ENFORCE_EQ(output_mapping.empty(),
@@ -546,30 +513,73 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
      true,
      platform::errors::PreconditionNotMet("the block has no var-desc"));
-  // Set attrs
+  // Get pass attrs.
+  auto use_varseqlen = Get<bool>("use_varseqlen");
+  auto with_interleaved = Get<bool>("with_interleaved");
+  auto tensorrt_transformer_posid =
+      Get<std::string>("tensorrt_transformer_posid");
+  auto tensorrt_transformer_maskid =
+      Get<std::string>("tensorrt_transformer_maskid");
+  auto use_dla = Get<bool>("trt_use_dla");
+  auto dla_core = Get<int>("trt_dla_core");
+  auto use_inspector = Get<bool>("use_inspector");
+  auto disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
+  auto context_memory_sharing = Get<bool>("context_memory_sharing");
+  auto enable_low_precision_io = Get<bool>("enable_low_precision_io");
+  auto workspace_size = Get<int64_t>("workspace_size");
+  auto gpu_device_id = Get<int>("gpu_device_id");
+  // Set op's attrs.
  op_desc->SetType("tensorrt_engine");
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
  op_desc->SetOutput(
      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
  op_desc->SetBlockAttr("sub_block", new_block);
  op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
  op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype);
  op_desc->SetAttr("max_batch_size", max_batch_size);
-  op_desc->SetAttr("workspace_size", Get<int64_t>("workspace_size"));
-  op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
+  op_desc->SetAttr("workspace_size", workspace_size);
+  op_desc->SetAttr("gpu_device_id", gpu_device_id);
  op_desc->SetAttr("output_name_mapping", output_mapping);
  op_desc->SetAttr("origin_output_rank", renamed_output_rank);
-  op_desc->SetAttr("parameters", params);
+  op_desc->SetAttr("parameters", parameters);
  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
-  op_desc->SetAttr("use_inspector", Get<bool>("use_inspector"));
-  op_desc->SetAttr("model_precision", Get<int>("model_precision"));
+  op_desc->SetAttr("use_inspector", use_inspector);
  op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
-  op_desc->SetAttr("enable_low_precision_io",
-                   Get<bool>("enable_low_precision_io"));
+  op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io);
+  if (!trt_tuned_dynamic_shape) {
+    std::vector<std::string> dynamic_shape_names;
+    std::vector<int> dynamic_shape_lens;
+    std::vector<int> min_input_shape_vector;
+    std::vector<int> max_input_shape_vector;
+    std::vector<int> opt_input_shape_vector;
+    for (const auto &it : min_input_shape) {
+      dynamic_shape_names.push_back(it.first);
+      dynamic_shape_lens.push_back(it.second.size());
+      for (const auto &value : it.second) {
+        min_input_shape_vector.push_back(value);
+      }
+    }
+    for (const auto &it : max_input_shape) {
+      for (const auto &value : it.second) {
+        max_input_shape_vector.push_back(value);
+      }
+    }
+    for (const auto &it : optim_input_shape) {
+      for (const auto &value : it.second) {
+        opt_input_shape_vector.push_back(value);
+      }
+    }
+    op_desc->SetAttr("dynamic_shape_names", dynamic_shape_names);
+    op_desc->SetAttr("dynamic_shape_lens", dynamic_shape_lens);
+    op_desc->SetAttr("min_input_shape_vector", min_input_shape_vector);
+    op_desc->SetAttr("max_input_shape_vector", max_input_shape_vector);
+    op_desc->SetAttr("opt_input_shape_vector", opt_input_shape_vector);
+  }
  // we record all inputs' shapes in attr to check if they are consistent
  // with the real inputs' shapes retrieved from scope when trt runs.
@@ -624,14 +634,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
  op_desc->SetAttr("engine_key", engine_key);
  op_desc->SetAttr("calibration_engine_key", calibration_engine_key);
  op_desc->SetAttr("predictor_id", predictor_id);
-  std::string trt_engine_serialized_data = "";
+  op_desc->SetAttr("use_varseqlen", use_varseqlen);
+  op_desc->SetAttr("with_interleaved", with_interleaved);
+  op_desc->SetAttr("use_dla", use_dla);
+  op_desc->SetAttr("dla_core", dla_core);
+  op_desc->SetAttr("disable_trt_plugin_fp16", disable_trt_plugin_fp16);
+  op_desc->SetAttr("context_memory_sharing", context_memory_sharing);
+  std::string trt_engine_serialized_data;
  op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);
  op_desc->Flush();
  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
  if (enable_int8 && !calibration_data.empty()) {
-    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
+    calibrator =
+        std::make_unique<tensorrt::TRTInt8Calibrator>(calibration_data);
    LOG(INFO) << "RUN Paddle TRT int8 calibration mode...";
  }
  // When in int8 mode and calibration_mode, the program just produce the
@@ -656,7 +672,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
                    "static shape mode instead.";
    min_input_shape = {};
    max_input_shape = {};
-    opt_input_shape = {};
+    optim_input_shape = {};
  }
  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
@@ -677,42 +693,33 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
                               framework::ir::Agent(node).subgraph()->end());
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
-  // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not
-  // run fp16.
-  // When running fp16, the output accuracy of the model will be affected,
-  // closing the plugin fp16 may bring some improvement on accuracy.
-  bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
+  tensorrt::TensorRTEngine::ConstructionParams params;
+  params.max_batch_size = max_batch_size;
+  params.max_workspace_size = workspace_size;
+  params.calibrator = calibrator.get();
+  params.device_id = gpu_device_id;
+  params.with_dynamic_shape = with_dynamic_shape;
+  params.min_input_shape = min_input_shape;
+  params.max_input_shape = max_input_shape;
+  params.optim_input_shape = optim_input_shape;
+  params.min_shape_tensor = min_shape_tensor;
+  params.max_shape_tensor = max_shape_tensor;
+  params.optim_shape_tensor = optim_shape_tensor;
+  params.disable_trt_plugin_fp16 = disable_trt_plugin_fp16;
+  params.precision = precision_mode;
+  params.use_varseqlen = use_varseqlen;
+  params.use_dla = use_dla;
+  params.dla_core = dla_core;
+  params.with_interleaved = with_interleaved;
+  params.tensorrt_transformer_posid = tensorrt_transformer_posid;
+  params.tensorrt_transformer_maskid = tensorrt_transformer_maskid;
+  params.context_memory_sharing = context_memory_sharing;
+  params.use_inspector = use_inspector;
+  params.enable_low_precision_io = enable_low_precision_io;
  tensorrt::TensorRTEngine *trt_engine =
      inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-          .Create(engine_key + std::to_string(predictor_id),
-                  max_batch_size,
-                  Get<int64_t>("workspace_size"),
-                  static_cast<phi::DataType>(precision_mode),
-                  calibrator.get(),
-                  Get<int>("gpu_device_id"),
-                  with_dynamic_shape,
-                  min_input_shape,
-                  max_input_shape,
-                  opt_input_shape,
-                  min_shape_tensor,
-                  max_shape_tensor,
-                  opt_shape_tensor,
-                  disable_trt_plugin_fp16,
-                  static_cast<phi::DataType>(Get<int>("model_precision")));
-  trt_engine->SetUseOSS(Get<bool>("use_varseqlen"));
-  trt_engine->SetWithInterleaved(Get<bool>("with_interleaved"));
-  trt_engine->SetTransformerPosid(
-      Get<std::string>("tensorrt_transformer_posid"));
-  trt_engine->SetTransformerMaskid(
-      Get<std::string>("tensorrt_transformer_maskid"));
-  trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
-  trt_engine->SetDLACore(Get<int>("trt_dla_core"));
-  trt_engine->SetUseInspector(Get<bool>("use_inspector"));
-  trt_engine->SetWithErnie(
-      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
-      graph->Has(framework::ir::kMultiheadMatmulPass));
-  trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));
-  trt_engine->SetLowPrecisionIO(Get<bool>("enable_low_precision_io"));
+          .Create(engine_key + std::to_string(predictor_id), params);
  if (use_static_engine) {
    trt_engine_serialized_data = GetTrtEngineSerializedData(
@@ -749,13 +756,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
                  "kernel etc). This process may cost a lot of time.";
  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
-  std::unordered_set<std::string> param_set(params.begin(), params.end());
+  std::unordered_set<std::string> parameters_set(parameters.begin(),
+                                                 parameters.end());
  inference::Singleton<inference::tensorrt::OpConverter>::Global()
      .ConvertBlockToTRTEngine(
          &block_desc_temp,
          *scope,
          std::vector<std::string>(input_names.begin(), input_names.end()),
-          param_set,
+          parameters_set,
          output_mapping,
          trt_engine);
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include <memory>
#include <string>
#include "paddle/fluid/framework/executor.h"
@@ -63,8 +64,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
                   "set."));
  }
-  auto graph = std::unique_ptr<framework::ir::Graph>(
-      new framework::ir::Graph(argument->main_program()));
+  auto graph = std::make_unique<framework::ir::Graph>(argument->main_program());
  argument->SetMainGraph(graph.release());
  auto *scope_ptr = argument->scope_ptr();
  PADDLE_ENFORCE_NOT_NULL(scope_ptr,
...
@@ -24,16 +24,6 @@ namespace inference {
namespace analysis {
void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) {
-  if (!argument->save_optimized_model()) {
-    LOG(WARNING) << "save_optim_cache_model is turned off, skip "
-                    "save_optimized_model_pass";
-    return;
-  }
-  if (!argument->enable_ir_optim()) {
-    LOG(WARNING) << "ir_optim is turned off, skip save_optimized_model_pass";
-    return;
-  }
  std::string model_opt_cache_dir = argument->optim_cache_dir();
  if (!model_opt_cache_dir.empty()) {
    if (!PathExists(model_opt_cache_dir)) {
@@ -55,9 +45,11 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) {
  auto* graph = argument->main_graph_ptr();
  framework::ProgramDesc optimized_program_desc;
  // NOTE(liuyuanle): If the following line of code is not added, an error
  // [SegmentFault] may occur!
  optimized_program_desc.CopyFrom(*argument->main_program().Proto());
  framework::ir::GraphToProgram(*graph, &optimized_program_desc);
  auto IsPersistable = [](const framework::VarDesc* var) {
@@ -133,11 +125,10 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) {
}
void SaveOptimizedModelPass::RunImpl(Argument* argument) {
-  // TODO(inference): Support trt.
-  if (argument->use_xpu() ||
-      (argument->use_gpu() && !argument->use_tensorrt())) {
-    SaveOptimizedModel(argument);
+  if (!argument->save_optimized_model() || !argument->enable_ir_optim()) {
+    return;
  }
+  SaveOptimizedModel(argument);
}
std::string SaveOptimizedModelPass::repr() const {
...
@@ -641,7 +641,7 @@ bool AnalysisPredictor::PrepareProgram(
}
bool AnalysisPredictor::CreateExecutor() {
-  executor_.reset(new paddle::framework::NaiveExecutor(place_));
+  executor_ = std::make_unique<paddle::framework::NaiveExecutor>(place_);
  return true;
}
@@ -1341,7 +1341,7 @@ bool AnalysisPredictor::GetFetch(std::vector<paddle::Tensor> *outputs,
void AnalysisPredictor::PrepareArgument() {
  VLOG(3) << "AnalysisPredictor::PrepareArgument";
  // Init std::unique_ptr argument_.
-  argument_.reset(new Argument);
+  argument_ = std::make_unique<Argument>();
  argument_->SetUseGPU(config_.use_gpu());
  argument_->SetUseCutlass(config_.use_cutlass_);
  argument_->SetUseFcPadding(config_.use_fc_padding());
@@ -1570,7 +1570,8 @@ void AnalysisPredictor::PrepareArgument() {
  if (!config_.ir_optim()) {
    argument_->SetEnableIrOptim(false);
-    if (config_.enable_gpu_mixed_) {
+    if (config_.enable_gpu_mixed_ &&
+        model_precision_ == phi::DataType::FLOAT32) {
      argument_->SetEnableIrOptim(true);
      pass_builder->ClearPasses();
      pass_builder->AppendPass("auto_mixed_precision_pass");
@@ -1886,6 +1887,10 @@ AnalysisPredictor::GetInputTypes() {
      input_type[name] = paddle_infer::DataType::UINT8;
    } else if (dtype == paddle::framework::proto::VarType::INT8) {
      input_type[name] = paddle_infer::DataType::INT8;
+    } else if (dtype == paddle::framework::proto::VarType::FP64) {
+      input_type[name] = paddle_infer::DataType::FLOAT64;
+    } else if (dtype == paddle::framework::proto::VarType::BOOL) {
+      input_type[name] = paddle_infer::DataType::BOOL;
    } else {
      PADDLE_THROW(paddle::platform::errors::Unimplemented(
          "Unsupported data type `%s` when get input dtype ", dtype));
@@ -2609,7 +2614,7 @@ AnalysisPredictor::~AnalysisPredictor() {
#ifdef PADDLE_WITH_TENSORRT
  if (config_.trt_engine_memory_sharing()) {
    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-        .releaseContextMemory(predictor_id_);
+        .ReleaseContextMemory(predictor_id_);
  }
#endif
}
...
@@ -167,7 +167,7 @@ class OpConverter {
                          op_desc.Type()));
    it->SetEngine(engine);
-    engine->SetScope(scope);
+    engine->SetScope(&scope);
    it->SetBlockDesc(block);
    (*it)(op, scope, test_mode);
@@ -301,7 +301,7 @@ class OpConverter {
      nvinfer1::DataType in_dtype = FluidDataType2TRT(var->GetDataType());
      if (engine->precision() == phi::DataType::FLOAT16 &&
          in_dtype == nvinfer1::DataType::kFLOAT &&
-          engine->EnableLowPrecisionIO()) {
+          engine->LowPrecisionIOEnabled()) {
        in_dtype = nvinfer1::DataType::kHALF;
      }
@@ -360,7 +360,7 @@ class OpConverter {
      nvinfer1::DataType out_dtype = FluidDataType2TRT(var->GetDataType());
      if (engine->precision() == phi::DataType::FLOAT16 &&
          out_dtype == nvinfer1::DataType::kFLOAT &&
-          engine->EnableLowPrecisionIO()) {
+          engine->LowPrecisionIOEnabled()) {
        out_dtype = nvinfer1::DataType::kHALF;
      }
      engine->DeclareOutput(output, out_dtype);
@@ -470,7 +470,7 @@ class OpConverter {
      auto shape = newShape->getDimensions();
      shuffle->setReshapeDimensions(shape);
    }
-    if (name != "") {
+    if (!name.empty()) {
      shuffle->setName(name.c_str());
    }
    return shuffle->getOutput(0);
@@ -481,7 +481,7 @@ class OpConverter {
                                 const std::string& name = "") {
    auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
    shuffle->setReshapeDimensions(shape);
-    if (name != "") {
+    if (!name.empty()) {
      shuffle->setName(name.c_str());
    }
    return shuffle->getOutput(0);
@@ -774,11 +774,6 @@ class OpConverter {
  bool test_mode_;
 private:
-  // registered op converter map, whose key is the fluid op type, and value is
-  // the pointer position of corresponding OpConverter class.
-  std::unordered_map<std::string, OpConverter*> converters_;
-  // fluid inference scope
-  framework::Scope* scope_{nullptr};
  std::mutex mut_;
};
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>  // NOLINT
+#include <memory>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -95,7 +96,11 @@ TEST(CustomPluginCreater, StaticShapePlugin) {
  // init trt engine
  std::unique_ptr<TensorRTEngine> engine_;
-  engine_.reset(new TensorRTEngine(5, 1 << 15));
+  TensorRTEngine::ConstructionParams params;
+  params.max_batch_size = 5;
+  params.max_workspace_size = 1 << 15;
+  engine_ = std::make_unique<TensorRTEngine>(params);
  engine_->InitNetwork();
  engine_->DeclareInput(
@@ -173,15 +178,10 @@ TEST(CustomPluginCreater, DynamicShapePlugin) {
  std::map<std::string, std::vector<int>> optim_input_shape = {
      {"x", {1, 2, 5, 5}}};
-  engine_.reset(new TensorRTEngine(5,
-                                   1 << 15,
-                                   phi::DataType::FLOAT32,
-                                   nullptr,
-                                   0,
-                                   true,
-                                   min_input_shape,
-                                   max_input_shape,
-                                   optim_input_shape));
+  TensorRTEngine::ConstructionParams params;
+  params.max_batch_size = 5;
+  params.max_workspace_size = 1 << 15;
+  engine_ = std::make_unique<TensorRTEngine>(params);
  engine_->InitNetwork();
  LOG(INFO) << "with_dynamic_shape " << engine_->with_dynamic_shape();
...
@@ -14,6 +14,8 @@ limitations under the License. */
#include <gtest/gtest.h>  // NOLINT
+#include <memory>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +30,10 @@ TEST(OpConverter, ConvertBlock) {
  // init trt engine
  std::unique_ptr<TensorRTEngine> engine_;
-  engine_.reset(new TensorRTEngine(5, 1 << 15));
+  TensorRTEngine::ConstructionParams params;
+  params.max_batch_size = 5;
+  params.max_workspace_size = 1 << 15;
+  engine_ = std::make_unique<TensorRTEngine>(params);
  engine_->InitNetwork();
  engine_->DeclareInput(
...
@@ -88,7 +88,10 @@ class TRTConvertValidation {
    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_),
                      0,
                      platform::errors::External("cudaStreamCreate error."));
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
+    TensorRTEngine::ConstructionParams params;
+    params.max_batch_size = max_batch_size;
+    params.max_workspace_size = workspace_size;
+    engine_ = std::make_unique<TensorRTEngine>(params);
    engine_->InitNetwork();
  }
@@ -155,7 +158,7 @@ class TRTConvertValidation {
    engine_->FreezeNetwork();
    // Declare outputs.
-    op_desc_.reset(new framework::OpDesc(desc, nullptr));
+    op_desc_ = std::make_unique<framework::OpDesc>(desc, nullptr);
  }
  // We use the set 'neglected_output' here, because some Ops like batch norm,
...
@@ -21,17 +21,15 @@ limitations under the License. */
#include "NvInferRuntimeCommon.h"
#include "cuda_runtime_api.h"  // NOLINT
#include "paddle/fluid/inference/tensorrt/helper.h"
-#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
-int TensorRTEngine::runtime_batch_ = 1;
thread_local int TensorRTEngine::predictor_id_per_thread = -1;
void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
@@ -64,10 +62,10 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
}
void TensorRTEngine::InitNetwork() {
-  freshDeviceId();
+  FreshDeviceId();
  infer_builder_.reset(createInferBuilder(&logger_));
-  if (with_dynamic_shape_) {
+  if (with_dynamic_shape()) {
    infer_network_.reset(infer_builder_->createNetworkV2(
        1U << static_cast<int>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
@@ -92,7 +90,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() {
    // IExecutionContext...
    // It's ok. We will set it later.
    nvinfer1::IExecutionContext *infer_context{nullptr};
-    if (context_memory_sharing_) {
+    if (params_.context_memory_sharing) {
      infer_context =
          infer_engine_->createExecutionContextWithoutDeviceMemory();
    } else {
@@ -102,7 +100,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() {
        infer_context,
        platform::errors::InvalidArgument(
            "TensorRT engine can not build execution context."));
-    if (with_dynamic_shape_) {
+    if (with_dynamic_shape()) {
      // need new profile if it's not the first
      if (cur_profile_num_ > 0) {
        infer_context->setOptimizationProfile(cur_profile_num_);
@@ -118,15 +116,15 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() {
void TensorRTEngine::Execute(int batch_size,
                             std::vector<void *> *buffers,
                             cudaStream_t stream) {
-  freshDeviceId();
+  FreshDeviceId();
  auto infer_context = context();
-  if (context_memory_sharing_) {
+  if (params_.context_memory_sharing) {
    void *context_memory{nullptr};
    context_memory =
        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-            .getContextMemory(
+            .GetContextMemory(
                predictor_id_per_thread,
-                phi::GPUPlace(device_id_),
+                phi::GPUPlace(device_id()),
                phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
    infer_context->setDeviceMemory(context_memory);
  }
@@ -182,12 +180,11 @@ bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context,
  } else {
    ret = context->enqueueV2(buffers->data(), stream, nullptr);
  }
-  SetRuntimeBatch(batch_size);
  return ret;
}
void TensorRTEngine::FreezeNetwork() {
-  freshDeviceId();
+  FreshDeviceId();
  VLOG(3) << "TRT to freeze network";
  PADDLE_ENFORCE_NOT_NULL(infer_builder_,
                          platform::errors::InvalidArgument(
@@ -197,17 +194,17 @@ void TensorRTEngine::FreezeNetwork() {
                          platform::errors::InvalidArgument(
                              "Call InitNetwork first to initialize network."));
  // build engine.
-  if (!with_dynamic_shape_) {
-    infer_builder_->setMaxBatchSize(max_batch_);
+  if (!with_dynamic_shape()) {
+    infer_builder_->setMaxBatchSize(params_.max_batch_size);
  }
#if IS_TRT_VERSION_GE(8300)
  infer_builder_config_->setMemoryPoolLimit(
-      nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_);
+      nvinfer1::MemoryPoolType::kWORKSPACE, params_.max_workspace_size);
#else
-  infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
+  infer_builder_config_->setMaxWorkspaceSize(params_.max_workspace_size);
#endif
-  bool enable_fp16 = (precision_ == phi::DataType::FLOAT16);
+  bool enable_fp16 = (precision() == phi::DataType::FLOAT16);
  if (enable_fp16) {
    bool support_fp16 = infer_builder_->platformHasFastFp16();
    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
@@ -219,15 +216,15 @@ void TensorRTEngine::FreezeNetwork() {
    }
  }
-  bool enable_int8 = (precision_ == phi::DataType::INT8);
+  bool enable_int8 = (precision() == phi::DataType::INT8);
  if (enable_int8) {
-    if (!use_dla_) {
+    if (!use_dla()) {
      infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);
-    if (calibrator_) {
-      infer_builder_config_->setInt8Calibrator(calibrator_);
+    if (params_.calibrator) {
+      infer_builder_config_->setInt8Calibrator(params_.calibrator);
    } else {
      infer_builder_config_->setInt8Calibrator(nullptr);
@@ -259,7 +256,7 @@ void TensorRTEngine::FreezeNetwork() {
    }
  }
-  if (use_dla_) {
+  if (use_dla()) {
    if (!enable_int8 && !enable_fp16) {
      LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
                      "set float32, so DLA is not used.";
@@ -268,42 +265,43 @@ void TensorRTEngine::FreezeNetwork() {
          << "TensorRT DLA is set by config, but your device does not have "
             "DLA, so DLA is not used.";
    } else {
-      if (dla_core_ < 0 || dla_core_ >= infer_builder_->getNbDLACores()) {
-        dla_core_ = 0;
+      if (params_.dla_core < 0 ||
+          params_.dla_core >= infer_builder_->getNbDLACores()) {
+        params_.dla_core = 0;
        LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
                     << infer_builder_->getNbDLACores() << ", but got "
-                     << dla_core_ << ", so use use 0 as default.";
+                     << params_.dla_core << ", so use use 0 as default.";
      }
      infer_builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
-      infer_builder_config_->setDLACore(dla_core_);
+      infer_builder_config_->setDLACore(params_.dla_core);
      infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
      LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore "
-                << dla_core_;
+                << params_.dla_core;
    }
  }
-  if (with_dynamic_shape_) {
+  if (with_dynamic_shape()) {
    LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
    for (int i = 0; i < max_profile_num_; i++) {
-      for (auto &input : min_input_shape_) {
+      for (auto &input : min_input_shape()) {
#if IS_TRT_VERSION_LT(7100)
        // trt6/trt7011 will check all_of input > 0
        if (!(std::all_of(input.second.begin(),
                          input.second.end(),
                          [](int x) { return x > 0; }) &&
-              std::all_of(max_input_shape_[input.first].begin(),
-                          max_input_shape_[input.first].end(),
+              std::all_of(max_input_shape()[input.first].begin(),
+                          max_input_shape()[input.first].end(),
                          [](int x) { return x > 0; }) &&
-              std::all_of(optim_input_shape_[input.first].begin(),
-                          optim_input_shape_[input.first].end(),
+              std::all_of(optim_input_shape()[input.first].begin(),
+                          optim_input_shape()[input.first].end(),
                          [](int x) { return x > 0; }))) {
          continue;
        }
#endif
        VLOG(4) << "TRT dynamic_shape set " << input.first
                << " min: " << Vec2Str(input.second)
-                << ", max: " << Vec2Str(max_input_shape_[input.first])
-                << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
+                << ", max: " << Vec2Str(max_input_shape()[input.first])
+                << ", opt: " << Vec2Str(optim_input_shape()[input.first]);
        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
@@ -312,38 +310,39 @@ void TensorRTEngine::FreezeNetwork() {
        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
            nvinfer1::OptProfileSelector::kMAX,
-            Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
+            Vec2TRT_Dims(max_input_shape()[input.first], input.first, true));
        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
            nvinfer1::OptProfileSelector::kOPT,
-            Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
+            Vec2TRT_Dims(optim_input_shape()[input.first], input.first, true));
      }
      for (int input_id = 0; input_id < network()->getNbInputs(); input_id++) {
        auto input_name = network()->getInput(input_id)->getName();
        if (!itensor_map_.count(input_name)) continue;
        if (!GetITensor(input_name)->isShapeTensor()) continue;
-        PADDLE_ENFORCE_EQ(min_shape_tensor_.count(input_name) &&
-                              max_shape_tensor_.count(input_name) &&
-                              optim_shape_tensor_.count(input_name),
+        PADDLE_ENFORCE_EQ(min_shape_tensor().count(input_name) > 0 &&
+                              max_shape_tensor().count(input_name) > 0 &&
+                              optim_shape_tensor().count(input_name) > 0,
                          true,
                          platform::errors::InvalidArgument(
                              "Fail to find min/max/optim shape value for TRT "
                              "network's shape tensor input named %s.",
                              input_name));
-        auto min_vec = min_shape_tensor_.at(input_name);
+        auto min_vec = min_shape_tensor().at(input_name);
        optim_profiles_[i]->setShapeValues(input_name,
                                           nvinfer1::OptProfileSelector::kMIN,
                                           min_vec.data(),
                                           min_vec.size());
-        optim_profiles_[i]->setShapeValues(input_name,
+        optim_profiles_[i]->setShapeValues(
+            input_name,
            nvinfer1::OptProfileSelector::kMAX,
-            max_shape_tensor_[input_name].data(),
+            max_shape_tensor()[input_name].data(),
            min_vec.size());
        optim_profiles_[i]->setShapeValues(
            input_name,
            nvinfer1::OptProfileSelector::kOPT,
-            optim_shape_tensor_[input_name].data(),
+            optim_shape_tensor()[input_name].data(),
            min_vec.size());
      }
@@ -358,7 +357,7 @@ void TensorRTEngine::FreezeNetwork() {
    }
  }
#if IS_TRT_VERSION_GE(8200)
-  if (use_inspector_) {
+  if (params_.use_inspector) {
    infer_builder_config_->setProfilingVerbosity(
        nvinfer1::ProfilingVerbosity::kDETAILED);
  }
@@ -388,12 +387,12 @@ void TensorRTEngine::FreezeNetwork() {
    cur_profile_num_ = 0;
  }
  // for engine context memory sharing
-  if (context_memory_sharing_) {
+  if (params_.context_memory_sharing) {
    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-        .updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
+        .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(),
                                 predictor_id_per_thread);
  }
-  if (use_inspector_) {
+  if (params_.use_inspector) {
    GetEngineInfo();
  }
}
...@@ -539,7 +538,7 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( ...@@ -539,7 +538,7 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor(
} }
// In fact , this is not always right, because we can't determine if the 0th // In fact , this is not always right, because we can't determine if the 0th
// dimension is batch. Just for run chenqu's model // dimension is batch. Just for run chenqu's model
if (!this->with_dynamic_shape()) { if (!with_dynamic_shape()) {
trt_in_shape.nbDims--; trt_in_shape.nbDims--;
for (int i = 0; i < trt_in_shape.nbDims; i++) { for (int i = 0; i < trt_in_shape.nbDims; i++) {
trt_in_shape.d[i] = trt_in_shape.d[i + 1]; trt_in_shape.d[i] = trt_in_shape.d[i + 1];
...@@ -563,12 +562,12 @@ std::unordered_map<std::string, nvinfer1::ITensor *> ...@@ -563,12 +562,12 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
} }
void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
freshDeviceId(); FreshDeviceId();
infer_runtime_.reset(createInferRuntime(&logger_)); infer_runtime_.reset(createInferRuntime(&logger_));
if (use_dla_) { if (use_dla()) {
if (precision_ != phi::DataType::INT8 && if (precision() != phi::DataType::INT8 &&
precision_ != phi::DataType::FLOAT16) { precision() != phi::DataType::FLOAT16) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used."; "set float32, so DLA is not used.";
} else if (infer_runtime_->getNbDLACores() == 0) { } else if (infer_runtime_->getNbDLACores() == 0) {
...@@ -576,15 +575,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { ...@@ -576,15 +575,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
<< "TensorRT DLA is set by config, but your device does not have " << "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used."; "DLA, so DLA is not used.";
} else { } else {
if (dla_core_ < 0 || dla_core_ >= infer_runtime_->getNbDLACores()) { if (params_.dla_core < 0 ||
dla_core_ = 0; params_.dla_core >= infer_runtime_->getNbDLACores()) {
params_.dla_core = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< infer_runtime_->getNbDLACores() << ", but got " << infer_runtime_->getNbDLACores() << ", but got "
<< dla_core_ << ", so use use 0 as default."; << params_.dla_core << ", so use use 0 as default.";
} }
infer_runtime_->setDLACore(dla_core_); infer_runtime_->setDLACore(params_.dla_core);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_; << params_.dla_core;
} }
} }
...@@ -602,20 +602,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { ...@@ -602,20 +602,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
binding_num_ = infer_engine_->getNbBindings(); binding_num_ = infer_engine_->getNbBindings();
// for engine context memory sharing // for engine context memory sharing
if (context_memory_sharing_) { if (params_.context_memory_sharing) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global() inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(), .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread); predictor_id_per_thread);
} }
if (use_inspector_) { if (params_.use_inspector) {
GetEngineInfo(); GetEngineInfo();
} }
} }
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
// Note: Only for support plugin. // Note: Only for support plugin.
TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
const std::string &name, const phi::DenseTensor &weight_tensor) { const std::string &name, const phi::DenseTensor &weight_tensor) {
...@@ -830,8 +826,6 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( ...@@ -830,8 +826,6 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight(
return weight; return weight;
} }
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin( nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, nvinfer1::ITensor *const *inputs,
int num_inputs, int num_inputs,
...@@ -856,16 +850,16 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( ...@@ -856,16 +850,16 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt(
return network()->addPluginV2(inputs, num_inputs, *plugin); return network()->addPluginV2(inputs, num_inputs, *plugin);
} }
void TensorRTEngine::freshDeviceId() { void TensorRTEngine::FreshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, PADDLE_ENFORCE_LT(device_id(),
count, count,
platform::errors::OutOfRange( platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.", "Device id %d exceeds the current device count: %d.",
device_id_, device_id(),
count)); count));
platform::SetDeviceId(device_id_); platform::SetDeviceId(device_id());
} }
void TensorRTEngine::GetEngineInfo() { void TensorRTEngine::GetEngineInfo() {
......
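
The Deserialize() hunk above now reads all DLA settings from params_: DLA is used only when the engine runs in int8 or fp16, the device actually reports DLA cores, and the requested core index is valid (otherwise core 0 is used). A condensed sketch of that decision, as a hypothetical free function that does not exist in the codebase (names are illustrative only):

#include "paddle/phi/common/data_type.h"

// Sketch only: mirrors the checks in TensorRTEngine::Deserialize().
// Returns the DLA core to use, or -1 to fall back to the GPU.
static int PickDlaCore(bool use_dla,
                       phi::DataType precision,
                       int requested_core,
                       int available_cores) {
  if (!use_dla) return -1;
  if (precision != phi::DataType::INT8 &&
      precision != phi::DataType::FLOAT16) {
    return -1;  // DLA must be paired with int8 or fp16
  }
  if (available_cores == 0) return -1;  // device has no DLA
  if (requested_core < 0 || requested_core >= available_cores) {
    return 0;  // invalid index: fall back to core 0, as the warning above does
  }
  return requested_core;
}
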
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <NvInfer.h>
#include <cstdint> #include <cstdint>
#include <map> #include <map>
#include <memory> #include <memory>
...@@ -25,21 +23,21 @@ limitations under the License. */ ...@@ -25,21 +23,21 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <NvInfer.h>
#include "NvInferRuntimeCommon.h" #include "NvInferRuntimeCommon.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/flags.h" #include "paddle/phi/core/flags.h"
#include "paddle/phi/core/stream.h" #include "paddle/phi/core/stream.h"
#include "paddle/utils/any.h"
PHI_DECLARE_bool(trt_ibuilder_cache); PHI_DECLARE_bool(trt_ibuilder_cache);
...@@ -47,6 +45,12 @@ namespace paddle { ...@@ -47,6 +45,12 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
class PluginTensorRT;
} // namespace plugin
class TRTInt8Calibrator;
// The code is mainly from TensorRT, thanks to the project. // The code is mainly from TensorRT, thanks to the project.
class TrtCudaGraph { class TrtCudaGraph {
public: public:
...@@ -105,140 +109,6 @@ class TrtCudaGraph { ...@@ -105,140 +109,6 @@ class TrtCudaGraph {
cudaGraphExec_t cuda_graph_exec_{}; cudaGraphExec_t cuda_graph_exec_{};
}; };
namespace plugin {
class PluginTensorRT;
} // namespace plugin
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
case FluidDT::VarType_Type_FP64:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
case FluidDT::VarType_Type_INT64:
return TRT_DT::kINT32;
case FluidDT::VarType_Type_FP16:
return TRT_DT::kHALF;
#if IS_TRT_VERSION_GE(8400)
case FluidDT::VarType_Type_BOOL:
return TRT_DT::kBOOL;
#endif
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"unsupported datatype in TRT op converter, type: %s. "
"Boolean type is supported as TRT input/output "
"using TensorRT v8.4+.",
VarType_Type_Name(type)));
}
return TRT_DT::kINT32;
}
// The T can be int32 or int64 type.
template <typename T>
nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape,
std::string input,
bool with_dynamic_shape = false) {
PADDLE_ENFORCE_GE(shape.size(),
0UL,
platform::errors::InvalidArgument(
"TensorRT's tensor input requires at least 0 "
"dimensions, but input %s has %d dims.",
input,
shape.size()));
auto ShapeStr = [](const std::vector<T>& shape) {
std::ostringstream os;
os << "[";
for (size_t i = 0; i < shape.size(); ++i) {
if (i == shape.size() - 1) {
os << shape[i];
} else {
os << shape[i] << ",";
}
}
os << "]";
return os.str();
};
if (!with_dynamic_shape) {
if (shape.size() == 4UL) {
if (shape[2] == -1 || shape[3] == -1) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims3(shape[1], shape[2], shape[3]);
} else if (shape.size() == 5UL) {
if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]);
} else if (shape.size() == 3UL) {
if (shape[1] == -1 || shape[2] == -1) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims2(shape[1], shape[2]);
} else if (shape.size() == 2UL) {
if (shape[1] == -1) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
nvinfer1::Dims dims;
dims.nbDims = 1;
dims.d[0] = shape[1];
return dims;
}
// static shape doesn't support 1D op so far.
PADDLE_ENFORCE_NE(shape.size(),
1UL,
platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s."
"it's not supported by trt so far",
input,
ShapeStr(shape)));
nvinfer1::Dims dims;
dims.nbDims = shape.size() - 1;
for (size_t i = 1; i < shape.size(); i++) {
dims.d[i - 1] = shape[i];
}
return dims;
} else {
if (shape.size() == 4UL) {
return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
} else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
}
nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
}
}
} // namespace
class TRTInt8Calibrator;
/* /*
* TensorRT Engine. * TensorRT Engine.
* *
...@@ -251,6 +121,54 @@ class TensorRTEngine { ...@@ -251,6 +121,54 @@ class TensorRTEngine {
using PredictorID = int; using PredictorID = int;
public: public:
/*
* Construction parameters of TensorRTEngine.
*/
struct ConstructionParams {
// The max batch size.
int32_t max_batch_size;
// The max memory size the engine uses.
int64_t max_workspace_size;
// The precision of engine.
phi::DataType precision{phi::DataType::FLOAT32};
TRTInt8Calibrator* calibrator{nullptr};
// Use for engine context memory sharing.
bool context_memory_sharing{false};
int device_id{0};
bool with_dynamic_shape{false};
bool use_dla{false};
int dla_core{0};
ShapeMapType min_input_shape;
ShapeMapType max_input_shape;
ShapeMapType optim_input_shape;
ShapeMapType min_shape_tensor;
ShapeMapType max_shape_tensor;
ShapeMapType optim_shape_tensor;
bool use_inspector{false};
//
// From tensorrt_subgraph_pass, only used for OpConverter.
//
bool use_varseqlen{false};
bool with_interleaved{false};
std::string tensorrt_transformer_posid;
std::string tensorrt_transformer_maskid;
bool enable_low_precision_io{false};
// Setting disable_trt_plugin_fp16 to true means that the TRT plugin will
// not run in fp16. Since running plugins in fp16 can affect the model's
// output accuracy, disabling plugin fp16 may improve accuracy.
bool disable_trt_plugin_fp16{false};
};
// Weight is model parameter. // Weight is model parameter.
class Weight { class Weight {
public: public:
...@@ -276,47 +194,14 @@ class TensorRTEngine { ...@@ -276,47 +194,14 @@ class TensorRTEngine {
nvinfer1::Weights w_; nvinfer1::Weights w_;
}; };
TensorRTEngine(int max_batch, TensorRTEngine(const ConstructionParams& params,
int64_t max_workspace,
phi::DataType precision = phi::DataType::FLOAT32,
TRTInt8Calibrator* calibrator = nullptr,
int device_id = 0,
bool with_dynamic_shape = false,
const ShapeMapType& min_input_shape = {},
const ShapeMapType& max_input_shape = {},
const ShapeMapType& optim_input_shape = {},
const ShapeMapType& min_shape_tensor = {},
const ShapeMapType& max_shape_tensor = {},
const ShapeMapType& optim_shape_tensor = {},
bool disable_trt_plugin_fp16 = false,
phi::DataType model_precision = phi::DataType::FLOAT32,
nvinfer1::ILogger& logger = NaiveLogger::Global()) nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch), : params_(params), logger_(logger) {
max_workspace_(max_workspace), dy::initLibNvInferPlugins(&logger_, "");
precision_(precision), static std::once_flag trt_plugin_registered;
calibrator_(calibrator), std::call_once(trt_plugin_registered, []() {
device_id_(device_id), tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
with_dynamic_shape_(with_dynamic_shape), });
min_input_shape_(min_input_shape),
max_input_shape_(max_input_shape),
optim_input_shape_(optim_input_shape),
min_shape_tensor_(min_shape_tensor),
max_shape_tensor_(max_shape_tensor),
optim_shape_tensor_(optim_shape_tensor),
disable_trt_plugin_fp16_(disable_trt_plugin_fp16),
model_precision_(model_precision),
logger_(logger) {
dy::initLibNvInferPlugins(&logger, "");
}
~TensorRTEngine() {
for (auto& attr : attrs_) {
if (attr_dels_.find(attr.first) != attr_dels_.end()) {
attr_dels_[attr.first]();
}
}
attrs_.clear();
attr_dels_.clear();
} }
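
The new constructor above registers the TensorRT plugin library once per process via std::call_once, which is why the explicit TrtPluginRegistry::Global()->RegistToTrt() calls disappear from the unit tests later in this diff. The pattern in isolation, as a hedged sketch rather than the actual code:

#include <mutex>

void RegisterTrtPluginsOnce() {
  // Thread-safe, once-only registration: only the first caller runs the
  // lambda; every later engine construction skips it.
  static std::once_flag registered;
  std::call_once(registered, []() {
    tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
  });
}
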
// Add an input and set its name, data type and dimension. // Add an input and set its name, data type and dimension.
...@@ -346,15 +231,6 @@ class TensorRTEngine { ...@@ -346,15 +231,6 @@ class TensorRTEngine {
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::IExecutionContext* context(); nvinfer1::IExecutionContext* context();
int GetProfileIndex() {
if (max_profile_num_ > 1) {
std::unique_lock<std::mutex> lock(mutex_);
return profile_index_[predictor_id_per_thread];
} else {
return 0;
}
}
int GetBindingsOffset() { int GetBindingsOffset() {
return (binding_num_ / max_profile_num_) * GetProfileIndex(); return (binding_num_ / max_profile_num_) * GetProfileIndex();
} }
...@@ -390,25 +266,20 @@ class TensorRTEngine { ...@@ -390,25 +266,20 @@ class TensorRTEngine {
void Deserialize(const std::string& engine_serialized_data); void Deserialize(const std::string& engine_serialized_data);
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
bool WithFp16() { bool WithFp16() {
bool enable_fp16 = (precision_ == phi::DataType::FLOAT16); bool enable_fp16 = (precision() == phi::DataType::FLOAT16);
bool support_fp16 = infer_builder_->platformHasFastFp16(); bool support_fp16 = infer_builder_->platformHasFastFp16();
// below is consistent with setFlag in engine.cc // below is consistent with setFlag in engine.cc
bool fall_back_fp16 = WithInt8() && !use_dla_; bool fall_back_fp16 = WithInt8() && !use_dla();
return (enable_fp16 || fall_back_fp16) && support_fp16; return (enable_fp16 || fall_back_fp16) && support_fp16;
} }
bool WithInt8() { bool WithInt8() {
bool enable_int8 = (precision_ == phi::DataType::INT8); bool enable_int8 = (precision() == phi::DataType::INT8);
bool support_int8 = infer_builder_->platformHasFastInt8(); bool support_int8 = infer_builder_->platformHasFastInt8();
return enable_int8 && support_int8; return enable_int8 && support_int8;
} }
int GetDeviceId() { return device_id_; }
nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs, nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, int num_inputs,
plugin::PluginTensorRT*); plugin::PluginTensorRT*);
...@@ -471,19 +342,6 @@ class TensorRTEngine { ...@@ -471,19 +342,6 @@ class TensorRTEngine {
suffix_counter += 1; suffix_counter += 1;
} }
void SetUseOSS(bool use_varseqlen) { use_varseqlen_ = use_varseqlen; }
void SetUseDLA(bool use_dla) { use_dla_ = use_dla; }
void SetDLACore(int dla_core) { dla_core_ = dla_core; }
void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; }
void SetWithInterleaved(bool with_interleaved) {
with_interleaved_ = with_interleaved;
}
void SetTransformerPosid(std::string tensorrt_transformer_posid) {
tensorrt_transformer_posid_ = tensorrt_transformer_posid;
}
void SetTransformerMaskid(std::string tensorrt_transformer_maskid) {
tensorrt_transformer_maskid_ = tensorrt_transformer_maskid;
}
void ClearWeights() { void ClearWeights() {
for (auto& weight_pair : weight_map) { for (auto& weight_pair : weight_map) {
weight_pair.second.reset(nullptr); weight_pair.second.reset(nullptr);
...@@ -508,12 +366,12 @@ class TensorRTEngine { ...@@ -508,12 +366,12 @@ class TensorRTEngine {
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
ShapeMapType& min_input_shape() { return min_input_shape_; } ShapeMapType& min_input_shape() { return params_.min_input_shape; }
ShapeMapType& max_input_shape() { return max_input_shape_; } ShapeMapType& max_input_shape() { return params_.max_input_shape; }
ShapeMapType& optim_input_shape() { return optim_input_shape_; } ShapeMapType& optim_input_shape() { return params_.optim_input_shape; }
ShapeMapType& min_shape_tensor() { return min_shape_tensor_; } ShapeMapType& min_shape_tensor() { return params_.min_shape_tensor; }
ShapeMapType& max_shape_tensor() { return max_shape_tensor_; } ShapeMapType& max_shape_tensor() { return params_.max_shape_tensor; }
ShapeMapType& optim_shape_tensor() { return optim_shape_tensor_; } ShapeMapType& optim_shape_tensor() { return params_.optim_shape_tensor; }
bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape, bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape,
const ShapeMapType& runtime_shape_tensor, const ShapeMapType& runtime_shape_tensor,
...@@ -526,22 +384,22 @@ class TensorRTEngine { ...@@ -526,22 +384,22 @@ class TensorRTEngine {
auto name = it.first; auto name = it.first;
auto input_shape = it.second; auto input_shape = it.second;
// Promote a 0-D tensor to a 1-D tensor. // Promote a 0-D tensor to a 1-D tensor.
if (input_shape.size() == 0) { if (input_shape.empty()) {
input_shape.push_back(1); input_shape.push_back(1);
} }
bool min_change = false; bool min_change = false;
bool max_change = false; bool max_change = false;
std::vector<int> bak_min_shape; std::vector<int> bak_min_shape;
std::vector<int> bak_max_shape; std::vector<int> bak_max_shape;
if (!min_input_shape_.count(name)) { if (!params_.min_input_shape.count(name)) {
min_input_shape_[name] = input_shape; params_.min_input_shape[name] = input_shape;
max_input_shape_[name] = input_shape; params_.max_input_shape[name] = input_shape;
optim_input_shape_[name] = input_shape; params_.optim_input_shape[name] = input_shape;
min_change = true; min_change = true;
max_change = true; max_change = true;
ret = true; ret = true;
} else { } else {
PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), PADDLE_ENFORCE_EQ(params_.min_input_shape[name].size(),
input_shape.size(), input_shape.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"TRT dynamic_shape min_input_shape %s size not " "TRT dynamic_shape min_input_shape %s size not "
...@@ -549,33 +407,33 @@ class TensorRTEngine { ...@@ -549,33 +407,33 @@ class TensorRTEngine {
", but the runtime_input_shape[%s].size()=%d.", ", but the runtime_input_shape[%s].size()=%d.",
name, name,
name, name,
min_input_shape_[name].size(), params_.min_input_shape[name].size(),
name, name,
input_shape.size())); input_shape.size()));
bak_min_shape = min_input_shape_[name]; bak_min_shape = params_.min_input_shape[name];
bak_max_shape = max_input_shape_[name]; bak_max_shape = params_.max_input_shape[name];
for (size_t d = 0; d < input_shape.size(); ++d) { for (size_t d = 0; d < input_shape.size(); ++d) {
if (input_shape[d] < min_input_shape_[name][d]) { if (input_shape[d] < params_.min_input_shape[name][d]) {
ret = true; ret = true;
min_change = true; min_change = true;
min_input_shape_[name][d] = input_shape[d]; params_.min_input_shape[name][d] = input_shape[d];
} }
if (input_shape[d] > max_input_shape_[name][d]) { if (input_shape[d] > params_.max_input_shape[name][d]) {
ret = true; ret = true;
max_change = true; max_change = true;
max_input_shape_[name][d] = input_shape[d]; params_.max_input_shape[name][d] = input_shape[d];
} }
} }
} }
if (min_change) if (min_change)
LOG(INFO) << "refactor tensor shape range: " << name LOG(INFO) << "refactor tensor shape range: " << name
<< ", min_shape from " << Vec2Str(bak_min_shape) << " to " << ", min_shape from " << Vec2Str(bak_min_shape) << " to "
<< Vec2Str(min_input_shape_[name]); << Vec2Str(params_.min_input_shape[name]);
if (max_change) if (max_change)
LOG(INFO) << "refactor tensor shape range: " << name LOG(INFO) << "refactor tensor shape range: " << name
<< ", max_shape from " << Vec2Str(bak_max_shape) << " to " << ", max_shape from " << Vec2Str(bak_max_shape) << " to "
<< Vec2Str(max_input_shape_[name]); << Vec2Str(params_.max_input_shape[name]);
if (min_change || max_change) changed->push_back(name); if (min_change || max_change) changed->push_back(name);
} }
for (const auto& it : runtime_shape_tensor) { for (const auto& it : runtime_shape_tensor) {
...@@ -585,15 +443,15 @@ class TensorRTEngine { ...@@ -585,15 +443,15 @@ class TensorRTEngine {
bool max_change = false; bool max_change = false;
std::vector<int> bak_min_shape; std::vector<int> bak_min_shape;
std::vector<int> bak_max_shape; std::vector<int> bak_max_shape;
if (!min_shape_tensor_.count(name)) { if (!params_.min_shape_tensor.count(name)) {
min_shape_tensor_[name] = shape_tensor; params_.min_shape_tensor[name] = shape_tensor;
max_shape_tensor_[name] = shape_tensor; params_.max_shape_tensor[name] = shape_tensor;
optim_shape_tensor_[name] = shape_tensor; params_.optim_shape_tensor[name] = shape_tensor;
min_change = true; min_change = true;
max_change = true; max_change = true;
ret = true; ret = true;
} else { } else {
PADDLE_ENFORCE_EQ(min_shape_tensor_[name].size(), PADDLE_ENFORCE_EQ(params_.min_shape_tensor[name].size(),
shape_tensor.size(), shape_tensor.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"TRT dynamic_shape min_shape_tensor %s size not " "TRT dynamic_shape min_shape_tensor %s size not "
...@@ -601,50 +459,50 @@ class TensorRTEngine { ...@@ -601,50 +459,50 @@ class TensorRTEngine {
", but the runtime_shape_tensor[%s].size()=%d.", ", but the runtime_shape_tensor[%s].size()=%d.",
name, name,
name, name,
min_shape_tensor_[name].size(), params_.min_shape_tensor[name].size(),
name, name,
shape_tensor.size())); shape_tensor.size()));
bak_min_shape = min_shape_tensor_[name]; bak_min_shape = params_.min_shape_tensor[name];
bak_max_shape = max_shape_tensor_[name]; bak_max_shape = params_.max_shape_tensor[name];
for (size_t d = 0; d < shape_tensor.size(); ++d) { for (size_t d = 0; d < shape_tensor.size(); ++d) {
if (shape_tensor[d] < min_shape_tensor_[name][d]) { if (shape_tensor[d] < params_.min_shape_tensor[name][d]) {
ret = true; ret = true;
min_change = true; min_change = true;
min_shape_tensor_[name][d] = shape_tensor[d]; params_.min_shape_tensor[name][d] = shape_tensor[d];
} }
if (shape_tensor[d] > max_shape_tensor_[name][d]) { if (shape_tensor[d] > params_.max_shape_tensor[name][d]) {
ret = true; ret = true;
max_change = true; max_change = true;
max_shape_tensor_[name][d] = shape_tensor[d]; params_.max_shape_tensor[name][d] = shape_tensor[d];
} }
} }
} }
if (min_change) if (min_change)
LOG(INFO) << "refactor shape tensor range: " << name LOG(INFO) << "refactor shape tensor range: " << name
<< ", min_shape from " << Vec2Str(bak_min_shape) << " to " << ", min_shape from " << Vec2Str(bak_min_shape) << " to "
<< Vec2Str(min_shape_tensor_[name]); << Vec2Str(params_.min_shape_tensor[name]);
if (max_change) if (max_change)
LOG(INFO) << "refactor shape tensor range: " << name LOG(INFO) << "refactor shape tensor range: " << name
<< ", max_shape from " << Vec2Str(bak_max_shape) << " to " << ", max_shape from " << Vec2Str(bak_max_shape) << " to "
<< Vec2Str(max_shape_tensor_[name]); << Vec2Str(params_.max_shape_tensor[name]);
if (min_change || max_change) tensor_changed->push_back(name); if (min_change || max_change) tensor_changed->push_back(name);
} }
return ret; return ret;
} }
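
AdjustDynamicShapeRange() above widens the recorded min/max shape ranges in params_ whenever a runtime shape falls outside them and reports which inputs changed. An illustrative call; the shape values and the trailing output-vector arguments are assumptions inferred from the body shown, not copied from a real caller:

// `engine` is a TensorRTEngine* created elsewhere, currently recording
// min {1,3,224,224} and max {4,3,224,224} for input "x".
std::map<std::string, std::vector<int>> runtime_shape = {{"x", {6, 3, 256, 224}}};
std::vector<std::string> changed, tensor_changed;
bool updated = engine->AdjustDynamicShapeRange(
    runtime_shape, /*runtime_shape_tensor=*/{}, &changed, &tensor_changed);
// Afterwards: max_input_shape["x"] becomes {6, 3, 256, 224}, min is untouched
// (no dimension shrank), updated == true, and changed == {"x"}.
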
bool use_varseqlen() { return use_varseqlen_; } bool use_varseqlen() { return params_.use_varseqlen; }
bool with_ernie() { return with_ernie_; } bool use_dla() { return params_.use_dla; }
bool with_interleaved() { return with_interleaved_; } bool with_interleaved() { return params_.with_interleaved; }
std::string tensorrt_transformer_posid() { const std::string& tensorrt_transformer_posid() {
return tensorrt_transformer_posid_; return params_.tensorrt_transformer_posid;
} }
std::string tensorrt_transformer_maskid() { const std::string& tensorrt_transformer_maskid() {
return tensorrt_transformer_maskid_; return params_.tensorrt_transformer_maskid;
} }
bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool disable_trt_plugin_fp16() { return params_.disable_trt_plugin_fp16; }
bool with_dynamic_shape() { return with_dynamic_shape_; } bool with_dynamic_shape() { return params_.with_dynamic_shape; }
phi::DataType precision() { return precision_; } phi::DataType precision() { return params_.precision; }
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
nvinfer1::IPluginV2Layer* AddDynamicPlugin( nvinfer1::IPluginV2Layer* AddDynamicPlugin(
...@@ -656,153 +514,50 @@ class TensorRTEngine { ...@@ -656,153 +514,50 @@ class TensorRTEngine {
} }
#endif #endif
bool Has(const std::string& attr_name) const {
return attrs_.count(attr_name) > 0;
}
void Erase(const std::string& attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}
// Set a pointer to the attribute. Engine takes ownership of the attribute.
template <typename AttrType>
void Set(const std::string& attr_name, AttrType* attr) {
if (attrs_.count(attr_name) == 0) {
PADDLE_ENFORCE_EQ(
attrs_.count(attr_name),
0,
platform::errors::AlreadyExists(
"Attribute %s already set in trt engine.", attr_name));
} else {
VLOG(3) << "Setting the attribute " << attr_name << " for trt engine "
<< this;
}
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}
// Set a pointer to the attribute. Engine doesn't take ownership. Caller
// should delete the attribute.
template <typename AttrType>
void SetNotOwned(const std::string& attr_name, AttrType* attr) {
PADDLE_ENFORCE_EQ(
attrs_.count(attr_name),
0,
platform::errors::AlreadyExists(
"Attribute %s already set in trt engine.", attr_name));
attrs_[attr_name] = attr;
}
// Get a reference to the attributed previously set.
template <typename AttrType>
AttrType& Get(const std::string& attr_name) const {
PADDLE_ENFORCE_NE(attrs_.find(attr_name),
attrs_.end(),
platform::errors::InvalidArgument(
"Attribute %s not found in trt engine.", attr_name));
try {
return *paddle::any_cast<AttrType*>(attrs_.at(attr_name));
} catch (paddle::bad_any_cast&) {
auto TypeToString = [](const std::type_info& info) -> std::string {
if (std::type_index(info) == std::type_index(typeid(bool*))) {
return "bool";
} else if (std::type_index(info) == std::type_index(typeid(int*))) {
return "int";
} else if (std::type_index(info) ==
std::type_index(typeid(const int*))) {
return "const int";
} else if (std::type_index(info) ==
std::type_index(typeid(std::string*))) {
return "std::string";
}
return info.name();
};
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid type for attritube %s, expected: %s, actual: %s.",
attr_name,
TypeToString(typeid(AttrType*)),
TypeToString(attrs_.at(attr_name).type())));
}
}
void SetProfileNum(int num) { max_profile_num_ = num; } void SetProfileNum(int num) { max_profile_num_ = num; }
void GetEngineInfo(); void SetScope(const framework::Scope* scope) { scope_ = scope; }
void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
void SetScope(const framework::Scope& scope) { scope_ = &scope; }
void SetContextMemorySharing(bool context_memory_sharing) {
context_memory_sharing_ = context_memory_sharing;
}
void SetLowPrecisionIO(bool low_precision_io) {
low_precision_io_ = low_precision_io;
}
bool EnableLowPrecisionIO() const { return low_precision_io_; }
void SetAllNodesLowerToTrt(bool all_nodes_offload_to_trt) { void SetAllNodesLowerToTrt(bool all_nodes_offload_to_trt) {
// all nodes are in trt, so we can use cudaGraph to optimize runtime. // all nodes are in trt, so we can use cudaGraph to optimize runtime.
startup_with_cudagraph_ = all_nodes_offload_to_trt; startup_with_cudagraph_ = all_nodes_offload_to_trt;
} }
bool LowPrecisionIOEnabled() const { return params_.enable_low_precision_io; }
private: private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated; // Each ICudaEngine object is bound to a specific GPU when it is instantiated;
// ensure that the thread is associated with the correct device by calling // ensure that the thread is associated with the correct device by calling
// freshDeviceId(). // FreshDeviceId().
void freshDeviceId(); void FreshDeviceId();
// Used for convert weight into Itensor
const framework::Scope* scope_; void GetEngineInfo();
// the max batch size int device_id() { return params_.device_id; }
int max_batch_;
// the runtime batch size
static int runtime_batch_;
// the max memory size the engine uses
int64_t max_workspace_;
phi::DataType precision_; int GetProfileIndex() {
TRTInt8Calibrator* calibrator_; if (max_profile_num_ > 1) {
// batch size of the current data, will be updated each Executation. std::unique_lock<std::mutex> lock(mutex_);
int batch_size_{-1}; return profile_index_[predictor_id_per_thread];
} else {
return 0;
}
}
// use for engine context memory sharing private:
bool context_memory_sharing_{false}; //
// Construction parameters.
//
ConstructionParams params_;
bool low_precision_io_{false}; //
// The following are runtime parameters.
//
int device_id_;
int max_profile_num_{1}; int max_profile_num_{1};
int cur_profile_num_{0}; int cur_profile_num_{0};
std::unordered_map<PredictorID, int> profile_index_; std::unordered_map<PredictorID, int> profile_index_;
bool with_dynamic_shape_{false};
ShapeMapType min_input_shape_;
ShapeMapType max_input_shape_;
ShapeMapType optim_input_shape_;
ShapeMapType min_shape_tensor_;
ShapeMapType max_shape_tensor_;
ShapeMapType optim_shape_tensor_;
bool disable_trt_plugin_fp16_{false};
phi::DataType model_precision_{phi::DataType::FLOAT32};
bool use_varseqlen_{false};
bool use_dla_{false};
int dla_core_{0};
bool with_ernie_{false};
bool with_interleaved_{false};
std::string tensorrt_transformer_posid_;
std::string tensorrt_transformer_maskid_;
nvinfer1::ILogger& logger_; nvinfer1::ILogger& logger_;
// max data size for the buffers. // max data size for the buffers.
...@@ -828,8 +583,9 @@ class TensorRTEngine { ...@@ -828,8 +583,9 @@ class TensorRTEngine {
bool cudagraph_inited_{false}; bool cudagraph_inited_{false};
bool startup_with_cudagraph_{false}; bool startup_with_cudagraph_{false};
std::unordered_map<std::string, paddle::any> attrs_; // Used for converting weights into ITensor
std::unordered_map<std::string, std::function<void(void)>> attr_dels_; const framework::Scope* scope_{nullptr};
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
int binding_num_; int binding_num_;
infer_ptr<nvinfer1::IBuilderConfig> infer_builder_config_; infer_ptr<nvinfer1::IBuilderConfig> infer_builder_config_;
...@@ -837,7 +593,6 @@ class TensorRTEngine { ...@@ -837,7 +593,6 @@ class TensorRTEngine {
std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_; std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_;
#endif #endif
std::mutex mutex_; std::mutex mutex_;
bool use_inspector_;
public: public:
thread_local static int predictor_id_per_thread; thread_local static int predictor_id_per_thread;
...@@ -872,7 +627,7 @@ class TRTEngineManager { ...@@ -872,7 +627,7 @@ class TRTEngineManager {
bool Empty() const { bool Empty() const {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return engines_.size() == 0; return engines_.empty();
} }
bool Has(const std::string& name) const { bool Has(const std::string& name) const {
...@@ -886,41 +641,13 @@ class TRTEngineManager { ...@@ -886,41 +641,13 @@ class TRTEngineManager {
return engines_.at(name).get(); return engines_.at(name).get();
} }
TensorRTEngine* Create( TensorRTEngine* Create(const std::string& name,
std::string name, const TensorRTEngine::ConstructionParams& params,
int max_batch,
int64_t max_workspace,
phi::DataType precision = phi::DataType::FLOAT32,
TRTInt8Calibrator* calibrator = nullptr,
int device_id = 0,
bool with_dynamic_shape = false,
const std::map<std::string, std::vector<int>> min_input_shape = {},
const std::map<std::string, std::vector<int>> max_input_shape = {},
const std::map<std::string, std::vector<int>> optim_input_shape = {},
const std::map<std::string, std::vector<int>> min_shape_tensor = {},
const std::map<std::string, std::vector<int>> max_shape_tensor = {},
const std::map<std::string, std::vector<int>> optim_shape_tensor = {},
bool disable_trt_plugin_fp16 = false,
phi::DataType model_precision = phi::DataType::FLOAT32,
nvinfer1::ILogger& logger = NaiveLogger::Global()) { nvinfer1::ILogger& logger = NaiveLogger::Global()) {
auto* p = new TensorRTEngine(max_batch, auto engine = std::make_unique<TensorRTEngine>(params, logger);
max_workspace,
precision,
calibrator,
device_id,
with_dynamic_shape,
min_input_shape,
max_input_shape,
optim_input_shape,
min_shape_tensor,
max_shape_tensor,
optim_shape_tensor,
disable_trt_plugin_fp16,
model_precision,
logger);
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
engines_[name].reset(p); engines_[name].reset(engine.release());
return p; return engines_[name].get();
} }
void DeleteAll() { void DeleteAll() {
...@@ -940,7 +667,7 @@ class TRTEngineManager { ...@@ -940,7 +667,7 @@ class TRTEngineManager {
} }
} }
void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) { void UpdateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
VLOG(3) << "TensorRT engine context memory size is " VLOG(3) << "TensorRT engine context memory size is "
<< mem_size / 1024.0 / 1024.0 << "MiB in predictor id " << mem_size / 1024.0 / 1024.0 << "MiB in predictor id "
<< predictor_id; << predictor_id;
...@@ -955,24 +682,24 @@ class TRTEngineManager { ...@@ -955,24 +682,24 @@ class TRTEngineManager {
} }
if (size_updated) { if (size_updated) {
releaseContextMemory(predictor_id); ReleaseContextMemory(predictor_id);
} }
} }
void* getContextMemory(PredictorID predictor_id, void* GetContextMemory(PredictorID predictor_id,
const phi::GPUPlace& place, const phi::GPUPlace& place,
const phi::Stream& stream) { const phi::Stream& stream) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
static auto alignment = getAlignmentSize(place); static auto alignment = GetAlignmentSize(place);
if (context_memorys_.count(predictor_id) == 0) { if (context_memorys_.count(predictor_id) == 0) {
auto context_memory = auto context_memory =
memory::Alloc(place, max_ctx_mem_size_ + alignment, stream); memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
context_memorys_[predictor_id] = std::move(context_memory); context_memorys_[predictor_id] = std::move(context_memory);
} }
return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); return GetAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
} }
void releaseContextMemory(PredictorID predictor_id) { void ReleaseContextMemory(PredictorID predictor_id) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
if (context_memorys_.count(predictor_id)) { if (context_memorys_.count(predictor_id)) {
context_memorys_[predictor_id].reset(nullptr); context_memorys_[predictor_id].reset(nullptr);
...@@ -981,12 +708,12 @@ class TRTEngineManager { ...@@ -981,12 +708,12 @@ class TRTEngineManager {
} }
private: private:
size_t getAlignmentSize(const phi::GPUPlace& place) { size_t GetAlignmentSize(const phi::GPUPlace& place) {
const auto& prop = platform::GetDeviceProperties(place.GetDeviceId()); const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
return prop.textureAlignment; return prop.textureAlignment;
} }
void* getAlignedMemory(void* addr, size_t alignment) { void* GetAlignedMemory(void* addr, size_t alignment) {
return reinterpret_cast<void*>(uintptr_t(addr) & (~(alignment - 1))); return reinterpret_cast<void*>(uintptr_t(addr) & (~(alignment - 1)));
} }
......
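
Taken together, the engine.h changes replace the long positional constructor and the matching TRTEngineManager::Create() overload with a single ConstructionParams struct. A minimal usage sketch under the new interface (input names and shape values are illustrative; the test fixtures later in this diff build engines the same way):

TensorRTEngine::ConstructionParams params;
params.max_batch_size = 16;
params.max_workspace_size = 1 << 30;
params.precision = phi::DataType::FLOAT16;
params.with_dynamic_shape = true;
params.min_input_shape = {{"x", {1, 3, 224, 224}}};
params.max_input_shape = {{"x", {8, 3, 224, 224}}};
params.optim_input_shape = {{"x", {4, 3, 224, 224}}};

auto* engine =
    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
        .Create("engine_0", params);
engine->InitNetwork();
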
...@@ -148,7 +148,7 @@ class NaiveProfiler : public nvinfer1::IProfiler { ...@@ -148,7 +148,7 @@ class NaiveProfiler : public nvinfer1::IProfiler {
typedef std::pair<std::string, float> Record; typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile; std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT { void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT override {
auto record = auto record =
std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r) { std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r) {
return r.first == layerName; return r.first == layerName;
...@@ -235,6 +235,130 @@ static inline nvinfer1::DataType PhiType2NvType(phi::DataType type) { ...@@ -235,6 +235,130 @@ static inline nvinfer1::DataType PhiType2NvType(phi::DataType type) {
return nv_type; return nv_type;
} }
using FluidDT = paddle::framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
static TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
case FluidDT::VarType_Type_FP64:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
case FluidDT::VarType_Type_INT64:
return TRT_DT::kINT32;
case FluidDT::VarType_Type_FP16:
return TRT_DT::kHALF;
#if IS_TRT_VERSION_GE(8400)
case FluidDT::VarType_Type_BOOL:
return TRT_DT::kBOOL;
#endif
default:
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"unsupported datatype in TRT op converter, type: %s. "
"Boolean type is supported as TRT input/output "
"using TensorRT v8.4+.",
VarType_Type_Name(type)));
}
return TRT_DT::kINT32;
}
// The T can be int32 or int64 type.
template <typename T>
static nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape,
std::string input,
bool with_dynamic_shape = false) {
PADDLE_ENFORCE_GE(shape.size(),
0UL,
paddle::platform::errors::InvalidArgument(
"TensorRT's tensor input requires at least 0 "
"dimensions, but input %s has %d dims.",
input,
shape.size()));
auto ShapeStr = [](const std::vector<T>& shape) {
std::ostringstream os;
os << "[";
for (size_t i = 0; i < shape.size(); ++i) {
if (i == shape.size() - 1) {
os << shape[i];
} else {
os << shape[i] << ",";
}
}
os << "]";
return os.str();
};
if (!with_dynamic_shape) {
if (shape.size() == 4UL) {
if (shape[2] == -1 || shape[3] == -1) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims3(shape[1], shape[2], shape[3]);
} else if (shape.size() == 5UL) {
if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]);
} else if (shape.size() == 3UL) {
if (shape[1] == -1 || shape[2] == -1) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
return nvinfer1::Dims2(shape[1], shape[2]);
} else if (shape.size() == 2UL) {
if (shape[1] == -1) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s, please enable "
"trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
input,
ShapeStr(shape)));
}
nvinfer1::Dims dims;
dims.nbDims = 1;
dims.d[0] = shape[1];
return dims;
}
// static shape doesn't support 1D op so far.
PADDLE_ENFORCE_NE(shape.size(),
1UL,
paddle::platform::errors::InvalidArgument(
"The input [%s] shape of trt subgraph is %s."
"it's not supported by trt so far",
input,
ShapeStr(shape)));
nvinfer1::Dims dims;
dims.nbDims = shape.size() - 1;
for (size_t i = 1; i < shape.size(); i++) {
dims.d[i - 1] = shape[i];
}
return dims;
} else {
if (shape.size() == 4UL) {
return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
} else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
}
nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
}
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
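
The FluidDataType2TRT and Vec2TRT_Dims helpers moved into this header keep their previous behavior: in static-shape mode the leading (batch) dimension is stripped before the TRT dims are built, while in dynamic-shape mode every dimension is kept. A short illustration with assumed shape values:

std::vector<int> shape = {4, 3, 224, 224};
// Static-shape mode: batch dimension dropped -> Dims3{3, 224, 224}.
nvinfer1::Dims static_dims =
    Vec2TRT_Dims(shape, "x", /*with_dynamic_shape=*/false);
// Dynamic-shape mode: all dimensions kept -> Dims4{4, 3, 224, 224}.
nvinfer1::Dims dynamic_dims =
    Vec2TRT_Dims(shape, "x", /*with_dynamic_shape=*/true);
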
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
#include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h"
#endif #endif
...@@ -35,7 +36,7 @@ namespace tensorrt { ...@@ -35,7 +36,7 @@ namespace tensorrt {
class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_ = std::make_unique<phi::GPUContext>(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -65,29 +66,21 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { ...@@ -65,29 +66,21 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
{"shape", {18, 8, 4}}}; {"shape", {18, 8, 4}}};
std::map<std::string, std::vector<int>> optim_input_value = { std::map<std::string, std::vector<int>> optim_input_value = {
{"shape", {18, 8, 4}}}; {"shape", {18, 8, 4}}};
engine_ = new TensorRTEngine(16,
1 << 10,
phi::DataType::FLOAT32,
nullptr,
0,
true,
min_input_shape,
max_input_shape,
optim_input_shape,
min_input_value,
max_input_value,
optim_input_value,
false,
phi::DataType::FLOAT32,
NaiveLogger::Global());
engine_->InitNetwork();
}
void TearDown() override { TensorRTEngine::ConstructionParams params;
if (engine_) { params.max_batch_size = 16;
delete engine_; params.max_workspace_size = 1 << 10;
engine_ = nullptr; params.with_dynamic_shape = true;
} params.min_input_shape = min_input_shape;
params.max_input_shape = max_input_shape;
params.optim_input_shape = optim_input_shape;
params.min_shape_tensor = min_input_value;
params.max_shape_tensor = max_input_value;
params.optim_shape_tensor = optim_input_value;
engine_ = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
engine_->InitNetwork();
} }
void PrepareInputOutput(const std::vector<float> &input, void PrepareInputOutput(const std::vector<float> &input,
...@@ -106,8 +99,8 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { ...@@ -106,8 +99,8 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
phi::DenseTensor input_; phi::DenseTensor input_;
phi::DenseTensor shape_; phi::DenseTensor shape_;
phi::DenseTensor output_; phi::DenseTensor output_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; std::unique_ptr<phi::GPUContext> ctx_;
}; };
TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) {
...@@ -167,7 +160,7 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { ...@@ -167,7 +160,7 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) {
class TensorRTDynamicEngineTest : public ::testing::Test { class TensorRTDynamicEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_ = std::make_unique<phi::GPUContext>(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -192,29 +185,18 @@ class TensorRTDynamicEngineTest : public ::testing::Test { ...@@ -192,29 +185,18 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
std::map<std::string, std::vector<int>> optim_input_shape = { std::map<std::string, std::vector<int>> optim_input_shape = {
{"input", {16, 32, 1, 1}}}; {"input", {16, 32, 1, 1}}};
engine_ = new TensorRTEngine(16, TensorRTEngine::ConstructionParams params;
1 << 10, params.max_batch_size = 16;
phi::DataType::FLOAT16, params.max_workspace_size = 1 << 10;
nullptr, params.with_dynamic_shape = true;
0, params.precision = phi::DataType::FLOAT16;
true, params.min_input_shape = min_input_shape;
min_input_shape, params.max_input_shape = max_input_shape;
max_input_shape, params.optim_input_shape = optim_input_shape;
optim_input_shape,
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
false,
phi::DataType::FLOAT32,
NaiveLogger::Global());
engine_->InitNetwork();
}
void TearDown() override { engine_ = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
if (engine_) {
delete engine_; engine_->InitNetwork();
engine_ = nullptr;
}
} }
void PrepareInputOutput(const std::vector<float16> &input, void PrepareInputOutput(const std::vector<float16> &input,
...@@ -230,8 +212,8 @@ class TensorRTDynamicEngineTest : public ::testing::Test { ...@@ -230,8 +212,8 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
protected: protected:
phi::DenseTensor input_; phi::DenseTensor input_;
phi::DenseTensor output_; phi::DenseTensor output_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; std::unique_ptr<phi::GPUContext> ctx_;
}; };
TEST_F(TensorRTDynamicEngineTest, test_spmm) { TEST_F(TensorRTDynamicEngineTest, test_spmm) {
...@@ -336,7 +318,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { ...@@ -336,7 +318,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) {
class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_ = std::make_unique<phi::GPUContext>(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -370,29 +352,18 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { ...@@ -370,29 +352,18 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
{"mask", {4, 1, 4, 4}}, {"mask", {4, 1, 4, 4}},
{"new_mask", {4, 1, 2, 2}}}; {"new_mask", {4, 1, 2, 2}}};
engine_ = new TensorRTEngine(16, TensorRTEngine::ConstructionParams params;
1 << 10, params.max_batch_size = 16;
phi::DataType::FLOAT32, params.max_workspace_size = 1 << 10;
nullptr, params.precision = phi::DataType::FLOAT32;
0, params.with_dynamic_shape = true;
true, params.min_input_shape = min_input_shape;
min_input_shape, params.max_input_shape = max_input_shape;
max_input_shape, params.optim_input_shape = optim_input_shape;
optim_input_shape,
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
false,
phi::DataType::FLOAT32,
NaiveLogger::Global());
engine_->InitNetwork();
}
void TearDown() override { engine_ = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
if (engine_) {
delete engine_; engine_->InitNetwork();
engine_ = nullptr;
}
} }
void PrepareInputOutput(const std::vector<std::vector<float>> inputs, void PrepareInputOutput(const std::vector<std::vector<float>> inputs,
...@@ -419,13 +390,12 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { ...@@ -419,13 +390,12 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected: protected:
std::vector<phi::DenseTensor> inputs_; std::vector<phi::DenseTensor> inputs_;
std::vector<phi::DenseTensor> outputs_; std::vector<phi::DenseTensor> outputs_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; std::unique_ptr<phi::GPUContext> ctx_;
}; };
TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
#if IS_TRT_VERSION_GE(8000) #if IS_TRT_VERSION_GE(8000)
tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
auto *attn = engine_->DeclareInput( auto *attn = engine_->DeclareInput(
"attn", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 4}); "attn", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 4});
auto *x = engine_->DeclareInput( auto *x = engine_->DeclareInput(
...@@ -545,7 +515,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { ...@@ -545,7 +515,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_ = std::make_unique<phi::GPUContext>(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -579,29 +549,17 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { ...@@ -579,29 +549,17 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test {
{"mask", {4, 1, 4, 4}}, {"mask", {4, 1, 4, 4}},
{"new_mask", {4, 1, 2, 2}}}; {"new_mask", {4, 1, 2, 2}}};
engine_ = new TensorRTEngine(16, TensorRTEngine::ConstructionParams params;
1 << 10, params.max_batch_size = 16;
phi::DataType::FLOAT16, params.max_workspace_size = 1 << 10;
nullptr, params.precision = phi::DataType::FLOAT16;
0, params.with_dynamic_shape = true;
true, params.min_input_shape = min_input_shape;
min_input_shape, params.max_input_shape = max_input_shape;
max_input_shape, params.optim_input_shape = optim_input_shape;
optim_input_shape,
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
std::map<std::string, std::vector<int>>(),
false,
phi::DataType::FLOAT32,
NaiveLogger::Global());
engine_->InitNetwork();
}
void TearDown() override { engine_ = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
if (engine_) { engine_->InitNetwork();
delete engine_;
engine_ = nullptr;
}
} }
void PrepareInputOutput(const std::vector<std::vector<float16>> inputs, void PrepareInputOutput(const std::vector<std::vector<float16>> inputs,
...@@ -628,13 +586,12 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { ...@@ -628,13 +586,12 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test {
protected: protected:
std::vector<phi::DenseTensor> inputs_; std::vector<phi::DenseTensor> inputs_;
std::vector<phi::DenseTensor> outputs_; std::vector<phi::DenseTensor> outputs_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; std::unique_ptr<phi::GPUContext> ctx_;
}; };
TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) {
#if IS_TRT_VERSION_GE(8000) #if IS_TRT_VERSION_GE(8000)
tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
auto *attn = engine_->DeclareInput( auto *attn = engine_->DeclareInput(
"attn", nvinfer1::DataType::kHALF, nvinfer1::Dims2{-1, 4}); "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims2{-1, 4});
auto *x = engine_->DeclareInput( auto *x = engine_->DeclareInput(
...@@ -754,7 +711,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { ...@@ -754,7 +711,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) {
class TensorRTDynamicShapeGNTest : public ::testing::Test { class TensorRTDynamicShapeGNTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_ = std::make_unique<phi::GPUContext>(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -782,29 +739,18 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { ...@@ -782,29 +739,18 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test {
std::map<std::string, std::vector<int>> max_input_value = {}; std::map<std::string, std::vector<int>> max_input_value = {};
std::map<std::string, std::vector<int>> optim_input_value = {}; std::map<std::string, std::vector<int>> optim_input_value = {};
engine_ = new TensorRTEngine(16, TensorRTEngine::ConstructionParams params;
1 << 10, params.max_batch_size = 16;
phi::DataType::INT8, params.max_workspace_size = 1 << 10;
nullptr, params.precision = phi::DataType::INT8;
0, params.with_dynamic_shape = true;
true, params.min_input_shape = min_input_shape;
min_input_shape, params.max_input_shape = max_input_shape;
max_input_shape, params.optim_input_shape = optim_input_shape;
optim_input_shape,
min_input_value,
max_input_value,
optim_input_value,
false,
phi::DataType::FLOAT32,
NaiveLogger::Global());
engine_->InitNetwork();
}
void TearDown() override { engine_ = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
if (engine_) {
delete engine_; engine_->InitNetwork();
engine_ = nullptr;
}
} }
void PrepareInputOutput(const std::vector<float> &input, void PrepareInputOutput(const std::vector<float> &input,
...@@ -923,8 +869,8 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { ...@@ -923,8 +869,8 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test {
protected: protected:
phi::DenseTensor x_; phi::DenseTensor x_;
phi::DenseTensor y_; phi::DenseTensor y_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; std::unique_ptr<phi::GPUContext> ctx_;
// case from SD // case from SD
int n_ = 2; int n_ = 2;
int c_ = 320; int c_ = 320;
...@@ -942,8 +888,6 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { ...@@ -942,8 +888,6 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test {
/* /*
TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) { TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) {
tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt();
float *bias = new float[c_]; float *bias = new float[c_];
float *scale = new float[c_]; float *scale = new float[c_];
for (int i = 0; i < c_; i++) { for (int i = 0; i < c_; i++) {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory>
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
...@@ -48,17 +49,13 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -48,17 +49,13 @@ class TensorRTEngineTest : public ::testing::Test {
.get()); .get());
ctx_->PartialInitWithAllocator(); ctx_->PartialInitWithAllocator();
engine_ = new TensorRTEngine(10, 1 << 10); TensorRTEngine::ConstructionParams params;
params.max_batch_size = 10;
params.max_workspace_size = 1 << 10;
engine_ = std::make_unique<TensorRTEngine>(params);
engine_->InitNetwork(); engine_->InitNetwork();
} }
void TearDown() override {
if (engine_) {
delete engine_;
engine_ = nullptr;
}
}
void PrepareInputOutput(const std::vector<float> &input, void PrepareInputOutput(const std::vector<float> &input,
std::vector<int> output_shape) { std::vector<int> output_shape) {
paddle::framework::TensorFromVector(input, *ctx_, &input_); paddle::framework::TensorFromVector(input, *ctx_, &input_);
...@@ -72,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -72,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected: protected:
phi::DenseTensor input_; phi::DenseTensor input_;
phi::DenseTensor output_; phi::DenseTensor output_;
TensorRTEngine *engine_; std::unique_ptr<TensorRTEngine> engine_;
phi::GPUContext *ctx_; phi::GPUContext *ctx_;
}; };
...@@ -111,15 +108,6 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -111,15 +108,6 @@ TEST_F(TensorRTEngineTest, add_layer) {
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data); buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data); buffers[1] = reinterpret_cast<void *>(y_gpu_data);
LOG(INFO) << "Set attr";
engine_->Set("test_attr", new std::string("test_attr"));
if (engine_->Has("test_attr")) {
auto attr_val = engine_->Get<std::string>("test_attr");
engine_->Erase("test_attr");
}
std::string *attr_key = new std::string("attr_key");
engine_->SetNotOwned("attr1", attr_key);
LOG(INFO) << "to execute"; LOG(INFO) << "to execute";
engine_->Execute(1, &buffers, ctx_->stream()); engine_->Execute(1, &buffers, ctx_->stream());
...@@ -128,8 +116,6 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -128,8 +116,6 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
delete attr_key;
} }
TEST_F(TensorRTEngineTest, add_layer_multi_dim) { TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
......
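For reference, the construction pattern the reworked test relies on looks roughly like the sketch below (not part of the diff). The max_batch_size / max_workspace_size fields and the std::make_unique-based ownership are taken from the hunk above; any ConstructionParams fields not shown here, and their defaults, are assumptions.
#include <memory>
#include "paddle/fluid/inference/tensorrt/engine.h"
using paddle::inference::tensorrt::TensorRTEngine;
// Minimal sketch: build an engine the way the updated test does.
std::unique_ptr<TensorRTEngine> MakeTestEngine() {
  TensorRTEngine::ConstructionParams params;
  params.max_batch_size = 10;           // was the first ctor argument
  params.max_workspace_size = 1 << 10;  // was the second ctor argument
  auto engine = std::make_unique<TensorRTEngine>(params);
  engine->InitNetwork();  // same call as before; only construction changed
  return engine;  // unique_ptr ownership makes the removed TearDown() delete obsolete
}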
...@@ -43,7 +43,7 @@ class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { ...@@ -43,7 +43,7 @@ class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
const platform::Place place); const platform::Place place);
explicit TRTInt8Calibrator(const std::string& calibration_data); explicit TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator(); ~TRTInt8Calibrator() override;
int getBatchSize() const TRT_NOEXCEPT override; int getBatchSize() const TRT_NOEXCEPT override;
...@@ -91,7 +91,7 @@ class TRTCalibratorEngine { ...@@ -91,7 +91,7 @@ class TRTCalibratorEngine {
*/ */
class TRTCalibratorEngineManager { class TRTCalibratorEngineManager {
public: public:
bool Has() const { return res_.size() > 0; } bool Has() const { return !res_.empty(); }
bool Has(const std::string& name) const { bool Has(const std::string& name) const {
if (res_.count(name) == 0) return false; if (res_.count(name) == 0) return false;
return res_.at(name).get() != nullptr; return res_.at(name).get() != nullptr;
......
...@@ -14,30 +14,34 @@ ...@@ -14,30 +14,34 @@
#pragma once #pragma once
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/funcs/data_type_transform.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cstdint>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/data_device_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/funcs/data_type_transform.h"
#include "paddle/utils/string/string_helper.h" #include "paddle/utils/string/string_helper.h"
namespace paddle { namespace paddle {
...@@ -171,10 +175,6 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -171,10 +175,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::string model_opt_cache_dir_; std::string model_opt_cache_dir_;
bool use_static_engine_; bool use_static_engine_;
phi::DataType precision_mode_; phi::DataType precision_mode_;
std::map<std::string, std::vector<int>> min_input_shape_{};
std::map<std::string, std::vector<int>> max_input_shape_{};
std::map<std::string, std::vector<int>> opt_input_shape_{};
phi::DataType model_precision_{phi::DataType::FLOAT32};
public: public:
TensorRTEngineOp(const std::string &type, TensorRTEngineOp(const std::string &type,
...@@ -185,7 +185,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -185,7 +185,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs"); input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size"); max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int64_t>("workspace_size"); workspace_size_ = Attr<int64_t>("workspace_size");
device_id_ = Attr<int>("gpu_id"); device_id_ = Attr<int>("gpu_device_id");
enable_int8_ = Attr<bool>("enable_int8"); enable_int8_ = Attr<bool>("enable_int8");
enable_fp16_ = Attr<bool>("enable_fp16"); enable_fp16_ = Attr<bool>("enable_fp16");
use_calib_mode_ = Attr<bool>("use_calib_mode"); use_calib_mode_ = Attr<bool>("use_calib_mode");
...@@ -200,43 +200,6 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -200,43 +200,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (use_static_engine_) { if (use_static_engine_) {
model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir"); model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir");
} }
model_precision_ = static_cast<phi::DataType>(Attr<int>("model_precision"));
if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") &&
HasAttr("max_input_shape") && HasAttr("opt_input_shape")) {
std::vector<std::string> dynamic_shape_names;
std::vector<std::vector<int>> min_input_shapes;
std::vector<std::vector<int>> max_input_shapes;
std::vector<std::vector<int>> opt_input_shapes;
std::vector<int> dynamic_shape_lens;
dynamic_shape_names =
Attr<std::vector<std::string>>("dynamic_shape_names");
std::vector<int> min_shapes = Attr<std::vector<int>>("min_input_shape");
std::vector<int> max_shapes = Attr<std::vector<int>>("max_input_shape");
std::vector<int> opt_shapes = Attr<std::vector<int>>("opt_input_shape");
dynamic_shape_lens = Attr<std::vector<int>>("dynamic_shape_lens");
int idx = 0;
for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) {
std::vector<int> tmp1, tmp2, tmp3;
for (int j = 0; j < dynamic_shape_lens[i]; ++j) {
tmp1.push_back(min_shapes[idx]);
tmp2.push_back(max_shapes[idx]);
tmp3.push_back(opt_shapes[idx++]);
}
min_input_shapes.emplace_back(tmp1);
max_input_shapes.emplace_back(tmp2);
opt_input_shapes.emplace_back(tmp3);
}
for (size_t i = 0; i < dynamic_shape_names.size(); ++i) {
min_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], min_input_shapes[i]));
max_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], max_input_shapes[i]));
opt_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], opt_input_shapes[i]));
}
}
auto params = Attr<std::vector<std::string>>("parameters"); auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) { for (const auto &param : params) {
...@@ -249,11 +212,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -249,11 +212,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
// calibration_mode being true means that we need to // calibration_mode being true means that we need to
// generate the calibration table data. // generate the calibration table data.
calibration_mode_ = calibration_mode_ =
(enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_); (enable_int8_ && calibration_data_.empty() && use_calib_mode_);
VLOG(4) << "calibration_mode: " << calibration_mode_; VLOG(4) << "calibration_mode: " << calibration_mode_;
if (enable_int8_ && calibration_data_.size()) { if (enable_int8_ && !calibration_data_.empty()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); calibrator_ = std::make_unique<TRTInt8Calibrator>(calibration_data_);
} }
bool has_engine = bool has_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global() inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
...@@ -486,36 +449,29 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -486,36 +449,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto t_shape = phi::vectorize(t.dims()); auto t_shape = phi::vectorize(t.dims());
runtime_batch = t_shape[0]; runtime_batch = t_shape[0];
} }
calib_res->calib_.reset(new TRTInt8Calibrator( calib_res->calib_ = std::make_unique<TRTInt8Calibrator>(
calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); calib_buffers, runtime_batch, calibration_engine_key_, dev_place);
calib_res->thr_.reset(new std::thread([&]() { calib_res->thr_.reset(new std::thread([&]() {
std::map<std::string, std::vector<int>> min_input_shape; TensorRTEngine::ConstructionParams params;
std::map<std::string, std::vector<int>> max_input_shape; params.max_batch_size = max_batch_size_;
std::map<std::string, std::vector<int>> opt_input_shape; params.max_workspace_size = workspace_size_;
std::map<std::string, std::vector<int>> min_shape_tensor; params.precision = precision_mode_;
std::map<std::string, std::vector<int>> max_shape_tensor; params.calibrator = calib_res->calib_.get();
std::map<std::string, std::vector<int>> opt_shape_tensor; params.device_id = dev_place.device;
if (shape_range_info_path_.size()) params.with_dynamic_shape = with_dynamic_shape_;
if (!shape_range_info_path_.empty()) {
inference::DeserializeShapeRangeInfo(shape_range_info_path_, inference::DeserializeShapeRangeInfo(shape_range_info_path_,
&min_input_shape, &params.min_input_shape,
&max_input_shape, &params.max_input_shape,
&opt_input_shape, &params.optim_input_shape,
&min_shape_tensor, &params.min_shape_tensor,
&max_shape_tensor, &params.max_shape_tensor,
&opt_shape_tensor); &params.optim_shape_tensor);
}
calib_res->engine_.reset(new TensorRTEngine(max_batch_size_, params.context_memory_sharing = Attr<bool>("context_memory_sharing");
workspace_size_, params.enable_low_precision_io = Attr<bool>("enable_low_precision_io");
precision_mode_, calib_res->engine_ = std::make_unique<TensorRTEngine>(params);
calib_res->calib_.get(),
dev_place.device,
with_dynamic_shape_,
min_input_shape,
max_input_shape,
opt_input_shape,
min_shape_tensor,
max_shape_tensor,
opt_shape_tensor));
VLOG(3) << "start the calib trt engine thread"; VLOG(3) << "start the calib trt engine thread";
PrepareTRTEngine(scope, calib_res->engine_.get()); PrepareTRTEngine(scope, calib_res->engine_.get());
})); }));
...@@ -597,7 +553,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -597,7 +553,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
// This must be a zero dimension tensor. // This must be a zero dimension tensor.
// At present, we convert it to a 1D tensor to feed it into Trt. // At present, we convert it to a 1D tensor to feed it into Trt.
if (t_shape.size() == 0) { if (t_shape.empty()) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
t.numel(), t.numel(),
1UL, 1UL,
...@@ -669,8 +625,12 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -669,8 +625,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
t.numel() * sizeof(int), t.numel() * sizeof(int),
nullptr); nullptr);
} else if (t.dtype() == phi::DataType::INT64) { } else if (t.dtype() == phi::DataType::INT64) {
auto int32_tensor = scope.FindVar(x + "_cast_to_INT32") std::string x_t = x + "_cast_to_INT32";
->GetMutable<phi::DenseTensor>(); if (scope.FindVar(x_t) == nullptr) {
const_cast<framework::Scope *>(&scope)->Var(x_t);
}
auto int32_tensor =
scope.FindVar(x_t)->GetMutable<phi::DenseTensor>();
*int32_tensor = phi::Cast<int64_t>( *int32_tensor = phi::Cast<int64_t>(
reinterpret_cast<const phi::GPUContext &>(dev_ctx), reinterpret_cast<const phi::GPUContext &>(dev_ctx),
t, t,
...@@ -703,16 +663,22 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -703,16 +663,22 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (t.dtype() == phi::DataType::FLOAT32) { if (t.dtype() == phi::DataType::FLOAT32) {
buffers[bind_index] = static_cast<void *>(t.data<float>()); buffers[bind_index] = static_cast<void *>(t.data<float>());
} else if (t.dtype() == phi::DataType::FLOAT64) { } else if (t.dtype() == phi::DataType::FLOAT64) {
auto fp32_tensor = std::string x_t = x + "_cast_to_FP32";
scope.FindVar(x + "_cast_to_FP32")->GetMutable<phi::DenseTensor>(); if (scope.FindVar(x_t) == nullptr) {
const_cast<framework::Scope *>(&scope)->Var(x_t);
}
auto fp32_tensor = scope.FindVar(x_t)->GetMutable<phi::DenseTensor>();
*fp32_tensor = phi::Cast<double>( *fp32_tensor = phi::Cast<double>(
reinterpret_cast<const phi::GPUContext &>(dev_ctx), reinterpret_cast<const phi::GPUContext &>(dev_ctx),
t, t,
phi::DataType::FLOAT32); phi::DataType::FLOAT32);
buffers[bind_index] = static_cast<void *>(fp32_tensor->data<float>()); buffers[bind_index] = static_cast<void *>(fp32_tensor->data<float>());
} else if (t.dtype() == phi::DataType::INT64) { } else if (t.dtype() == phi::DataType::INT64) {
auto int32_tensor = std::string x_t = x + "_cast_to_INT32";
scope.FindVar(x + "_cast_to_INT32")->GetMutable<phi::DenseTensor>(); if (scope.FindVar(x_t) == nullptr) {
const_cast<framework::Scope *>(&scope)->Var(x_t);
}
auto int32_tensor = scope.FindVar(x_t)->GetMutable<phi::DenseTensor>();
*int32_tensor = phi::Cast<int64_t>( *int32_tensor = phi::Cast<int64_t>(
reinterpret_cast<const phi::GPUContext &>(dev_ctx), reinterpret_cast<const phi::GPUContext &>(dev_ctx),
t, t,
...@@ -827,8 +793,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -827,8 +793,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto y = Outputs("Ys")[i]; auto y = Outputs("Ys")[i];
auto *fluid_v = scope.FindVar(y); auto *fluid_v = scope.FindVar(y);
auto *fluid_t = fluid_v->GetMutable<phi::DenseTensor>(); auto *fluid_t = fluid_v->GetMutable<phi::DenseTensor>();
auto int32_tensor = std::string y_t = y + "_cast_to_INT64";
scope.FindVar(y + "_cast_to_INT64")->GetMutable<phi::DenseTensor>(); if (scope.FindVar(y_t) == nullptr) {
const_cast<framework::Scope *>(&scope)->Var(y_t);
}
auto int32_tensor = scope.FindVar(y_t)->GetMutable<phi::DenseTensor>();
int32_tensor->Resize(fluid_t->dims()); int32_tensor->Resize(fluid_t->dims());
dev_ctx.Alloc<int32_t>(int32_tensor); dev_ctx.Alloc<int32_t>(int32_tensor);
framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor);
...@@ -840,8 +809,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -840,8 +809,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto y = Outputs("Ys")[i]; auto y = Outputs("Ys")[i];
auto *fluid_v = scope.FindVar(y); auto *fluid_v = scope.FindVar(y);
auto *fluid_t = fluid_v->GetMutable<phi::DenseTensor>(); auto *fluid_t = fluid_v->GetMutable<phi::DenseTensor>();
auto fp32_tensor = std::string y_t = y + "_cast_to_FP64";
scope.FindVar(y + "_cast_to_FP64")->GetMutable<phi::DenseTensor>(); if (scope.FindVar(y_t) == nullptr) {
const_cast<framework::Scope *>(&scope)->Var(y_t);
}
auto fp32_tensor = scope.FindVar(y_t)->GetMutable<phi::DenseTensor>();
fp32_tensor->Resize(fluid_t->dims()); fp32_tensor->Resize(fluid_t->dims());
dev_ctx.Alloc<float>(fp32_tensor); dev_ctx.Alloc<float>(fp32_tensor);
framework::TensorCopy(*fluid_t, dev_place, dev_ctx, fp32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, fp32_tensor);
...@@ -856,20 +828,92 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -856,20 +828,92 @@ class TensorRTEngineOp : public framework::OperatorBase {
TensorRTEngine *GetEngine(const framework::Scope &scope, TensorRTEngine *GetEngine(const framework::Scope &scope,
const platform::Place &dev_place) const { const platform::Place &dev_place) const {
if (!trt_engine_) { if (!trt_engine_) {
TensorRTEngine::ConstructionParams params;
params.max_batch_size = max_batch_size_;
params.max_workspace_size = workspace_size_;
params.precision = precision_mode_;
params.calibrator = calibrator_.get();
params.device_id = dev_place.device;
params.with_dynamic_shape = with_dynamic_shape_;
params.context_memory_sharing = Attr<bool>("context_memory_sharing");
params.use_dla = Attr<bool>("use_dla");
params.dla_core = Attr<int>("dla_core");
params.disable_trt_plugin_fp16 = Attr<bool>("disable_trt_plugin_fp16");
params.enable_low_precision_io = Attr<bool>("enable_low_precision_io");
params.use_inspector = Attr<bool>("use_inspector");
if (!shape_range_info_path_.empty()) {
inference::DeserializeShapeRangeInfo(shape_range_info_path_,
&params.min_input_shape,
&params.max_input_shape,
&params.optim_input_shape,
&params.min_shape_tensor,
&params.max_shape_tensor,
&params.optim_shape_tensor);
} else {
if (HasAttr("dynamic_shape_names") &&
HasAttr("min_input_shape_vector") &&
HasAttr("max_input_shape_vector") &&
HasAttr("opt_input_shape_vector")) {
std::vector<std::string> dynamic_shape_names;
std::vector<std::vector<int>> min_input_shapes;
std::vector<std::vector<int>> max_input_shapes;
std::vector<std::vector<int>> opt_input_shapes;
std::vector<int> dynamic_shape_lens;
dynamic_shape_names =
Attr<std::vector<std::string>>("dynamic_shape_names");
std::vector<int> min_shapes =
Attr<std::vector<int>>("min_input_shape_vector");
std::vector<int> max_shapes =
Attr<std::vector<int>>("max_input_shape_vector");
std::vector<int> opt_shapes =
Attr<std::vector<int>>("opt_input_shape_vector");
dynamic_shape_lens = Attr<std::vector<int>>("dynamic_shape_lens");
int idx = 0;
for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) {
std::vector<int> tmp1, tmp2, tmp3;
for (int j = 0; j < dynamic_shape_lens[i]; ++j) {
tmp1.push_back(min_shapes[idx]);
tmp2.push_back(max_shapes[idx]);
tmp3.push_back(opt_shapes[idx++]);
}
min_input_shapes.emplace_back(tmp1);
max_input_shapes.emplace_back(tmp2);
opt_input_shapes.emplace_back(tmp3);
}
for (size_t i = 0; i < dynamic_shape_names.size(); ++i) {
params.min_input_shape.insert(
std::make_pair(dynamic_shape_names[i], min_input_shapes[i]));
params.max_input_shape.insert(
std::make_pair(dynamic_shape_names[i], max_input_shapes[i]));
params.optim_input_shape.insert(
std::make_pair(dynamic_shape_names[i], opt_input_shapes[i]));
}
}
}
trt_engine_ = trt_engine_ =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global() inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key_ + std::to_string(predictor_id_), .Create(engine_key_ + std::to_string(predictor_id_), params);
max_batch_size_,
workspace_size_, if (use_static_engine_) {
precision_mode_, LOG(INFO) << "Load TRT Optimized Info from "
calibrator_.get(), << inference::analysis::GetTrtEngineSerializedPath(
device_id_, model_opt_cache_dir_, engine_key_);
with_dynamic_shape_, std::string trt_engine_serialized_data =
min_input_shape_, inference::analysis::GetTrtEngineSerializedData(
max_input_shape_, model_opt_cache_dir_, engine_key_);
opt_input_shape_); trt_engine_->Deserialize(trt_engine_serialized_data);
} else {
// This branch is mainly used for unit tests.
PrepareTRTEngine(scope, trt_engine_); PrepareTRTEngine(scope, trt_engine_);
} }
}
PADDLE_ENFORCE_NOT_NULL(
trt_engine_,
platform::errors::Fatal(
"The pointer to tensorrt engine should not be null."));
return trt_engine_; return trt_engine_;
} }
}; };
......
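The repeated FindVar/Var handling added above for the "_cast_to_INT32", "_cast_to_FP32", "_cast_to_INT64" and "_cast_to_FP64" helpers can be summarized by the sketch below (not part of the diff). The helper name GetOrCreateCastVar is hypothetical; the Scope and DenseTensor calls are the ones used in the hunks.
#include <string>
#include "paddle/fluid/framework/scope.h"
#include "paddle/phi/core/dense_tensor.h"
// Hypothetical helper mirroring the pattern added above: the op receives a
// const Scope, so it creates the cast variable on first use via const_cast
// and reuses it on later runs instead of assuming a pass created it.
static phi::DenseTensor *GetOrCreateCastVar(
    const paddle::framework::Scope &scope, const std::string &var_name) {
  if (scope.FindVar(var_name) == nullptr) {
    const_cast<paddle::framework::Scope *>(&scope)->Var(var_name);
  }
  return scope.FindVar(var_name)->GetMutable<phi::DenseTensor>();
}
// Usage in the INT64 input path would then look like:
//   auto *int32_tensor = GetOrCreateCastVar(scope, x + "_cast_to_INT32");
//   *int32_tensor = phi::Cast<int64_t>(gpu_ctx, t, phi::DataType::INT32);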
...@@ -113,7 +113,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ...@@ -113,7 +113,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
ASSERT_EQ(block_->ops_size(), 2); ASSERT_EQ(block_->ops_size(), 2);
LOG(INFO) << "create tensorrt desc"; LOG(INFO) << "create tensorrt op desc";
framework::OpDesc engine_op_desc(nullptr); framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
...@@ -138,19 +138,18 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ...@@ -138,19 +138,18 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string("")); engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0; int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id); engine_op_desc.SetAttr("gpu_device_id", device_id);
engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime); engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime);
engine_op_desc.SetAttr("use_static_engine", true); engine_op_desc.SetAttr("use_static_engine", false);
engine_op_desc.SetAttr("dynamic_shape_names", std::vector<std::string>{"x"}); engine_op_desc.SetAttr("with_dynamic_shape", false);
engine_op_desc.SetAttr("dynamic_shape_lens", std::vector<int>{4}); engine_op_desc.SetAttr("context_memory_sharing", true);
engine_op_desc.SetAttr("with_dynamic_shape", true); engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 1, 1, 1}); engine_op_desc.SetAttr("enable_low_precision_io", false);
engine_op_desc.SetAttr("max_input_shape", std::vector<int>{16, 16, 16, 16}); engine_op_desc.SetAttr("use_inspector", false);
engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 4, 4}); engine_op_desc.SetAttr("use_dla", false);
engine_op_desc.SetAttr("model_precision", engine_op_desc.SetAttr("dla_core", 0);
static_cast<int>(phi::DataType::FLOAT32));
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...@@ -263,7 +262,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -263,7 +262,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
{output_dim, output_dim}, {output_dim, output_dim},
{batch_size, output_dim}); {batch_size, output_dim});
LOG(INFO) << "create tensorrt desc"; LOG(INFO) << "create tensorrt op desc";
framework::OpDesc engine_op_desc(nullptr); framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
...@@ -288,11 +287,18 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -288,11 +287,18 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string("")); engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0; int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id); engine_op_desc.SetAttr("gpu_device_id", device_id);
engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
engine_op_desc.SetAttr("allow_build_at_runtime", false); engine_op_desc.SetAttr("allow_build_at_runtime", false);
engine_op_desc.SetAttr("use_static_engine", false); engine_op_desc.SetAttr("use_static_engine", false);
engine_op_desc.SetAttr("with_dynamic_shape", false);
engine_op_desc.SetAttr("context_memory_sharing", true);
engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
engine_op_desc.SetAttr("enable_low_precision_io", false);
engine_op_desc.SetAttr("use_inspector", false);
engine_op_desc.SetAttr("use_dla", false);
engine_op_desc.SetAttr("dla_core", 0);
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......
...@@ -174,6 +174,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -174,6 +174,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60)
set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60)
set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 300) set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 300)
set_tests_properties(test_save_optimized_model_pass PROPERTIES TIMEOUT 300)
if(WITH_NV_JETSON) if(WITH_NV_JETSON)
set_tests_properties( set_tests_properties(
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import unittest
import numpy as np
import paddle
from paddle.inference import Config, PrecisionType, create_predictor
from paddle.jit import to_static
from paddle.static import InputSpec
from paddle.vision.models import alexnet
class TestSaveOptimizedModelPass:
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
net = alexnet(True)
model = to_static(
net, input_spec=[InputSpec(shape=[None, 3, 224, 224], name='x')]
)
paddle.jit.save(
model, os.path.join(self.temp_dir.name, 'alexnet/inference')
)
def tearDown(self):
self.temp_dir.cleanup()
def get_baseline(self):
predictor = self.init_predictor(save_optimized_model=True)
inputs = [
paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32))
]
outputs = predictor.run(inputs)
return outputs[0]
def get_test_output(self):
predictor = self.init_predictor(save_optimized_model=False)
inputs = [
paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32))
]
outputs = predictor.run(inputs)
return outputs[0]
def test_output(self):
if paddle.is_compiled_with_cuda():
baseline = self.get_baseline()
test_output = self.get_test_output()
np.testing.assert_allclose(
baseline.numpy().flatten(),
test_output.numpy().flatten(),
)
class TestSaveOptimizedModelPassWithGPU(
TestSaveOptimizedModelPass, unittest.TestCase
):
def init_predictor(self, save_optimized_model: bool):
if save_optimized_model is True:
config = Config(
os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'),
os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'),
)
config.enable_use_gpu(256, 0, PrecisionType.Half)
config.enable_memory_optim()
config.switch_ir_optim(True)
config.set_optim_cache_dir(
os.path.join(self.temp_dir.name, 'alexnet')
)
config.enable_save_optim_model(True)
else:
config = Config(
os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'),
os.path.join(
self.temp_dir.name, 'alexnet/_optimized.pdiparams'
),
)
config.enable_use_gpu(256, 0, PrecisionType.Half)
config.enable_memory_optim()
config.switch_ir_optim(False)
predictor = create_predictor(config)
return predictor
class TestSaveOptimizedModelPassWithTRT(
TestSaveOptimizedModelPass, unittest.TestCase
):
def init_predictor(self, save_optimized_model: bool):
if save_optimized_model is True:
config = Config(
os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'),
os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'),
)
config.enable_use_gpu(256, 0)
config.enable_tensorrt_engine(
workspace_size=1 << 30,
max_batch_size=1,
min_subgraph_size=3,
precision_mode=PrecisionType.Half,
use_static=True,
use_calib_mode=False,
)
config.set_trt_dynamic_shape_info(
{"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]},
{"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]},
{"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]},
)
config.exp_disable_tensorrt_ops(["flatten_contiguous_range"])
config.enable_memory_optim()
config.switch_ir_optim(True)
config.set_optim_cache_dir(
os.path.join(self.temp_dir.name, 'alexnet')
)
config.enable_save_optim_model(True)
else:
config = Config(
os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'),
os.path.join(
self.temp_dir.name, 'alexnet/_optimized.pdiparams'
),
)
config.enable_use_gpu(256, 0)
config.enable_tensorrt_engine(
workspace_size=1 << 30,
max_batch_size=1,
min_subgraph_size=3,
precision_mode=PrecisionType.Half,
use_static=True,
use_calib_mode=False,
)
config.enable_memory_optim()
config.switch_ir_optim(False)
predictor = create_predictor(config)
return predictor
if __name__ == '__main__':
unittest.main()