Unverified commit 69793a27, authored by Leo Chen, committed by GitHub

Add TensorRT inspector into Paddle-TRT (#38362)

Parent 575fa0fe
......@@ -83,3 +83,4 @@
| jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) |
| mingxu1067 | Ming Huang (NVIDIA) |
| zlsh80826 | Reese Wang (NVIDIA) |
| leo0519 | Leo Chen (NVIDIA) |
......@@ -219,6 +219,7 @@ struct Argument {
bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
......
......@@ -156,6 +156,7 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
// tuned trt dynamic_shape
pass->Set("trt_shape_range_info_path",
......
......@@ -265,6 +265,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
op_desc->SetAttr("parameters", params);
op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
op_desc->SetAttr("use_inspector", Get<bool>("use_inspector"));
// we record all inputs' shapes in attr to check if they are consistent
// with the real inputs' shapes retrieved from scope when trt runs.
......@@ -375,6 +376,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine->SetWithInterleaved(Get<bool>("with_interleaved"));
trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
trt_engine->SetDLACore(Get<int>("trt_dla_core"));
trt_engine->SetUseInspector(Get<bool>("use_inspector"));
trt_engine->SetWithErnie(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
......
......@@ -194,6 +194,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_allow_build_at_runtime_);
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
CP_MEMBER(trt_use_inspector_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
......@@ -427,6 +428,8 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
trt_dla_core_ = dla_core;
}
void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }
void AnalysisConfig::Exp_DisableTensorRtOPs(
const std::vector<std::string> &ops) {
trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
......
......@@ -615,6 +615,7 @@ void AnalysisPredictor::PrepareArgument() {
config_.tuned_tensorrt_dynamic_shape());
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
}
if (config_.dlnne_enabled()) {
......
......@@ -521,6 +521,9 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_dla_enabled() { return trt_use_dla_; }
void EnableTensorRtInspector();
bool tensorrt_inspector_enabled() { return trt_use_inspector_; }
void EnableDlnne(int min_subgraph_size = 3);
bool dlnne_enabled() const { return use_dlnne_; }
......@@ -807,6 +810,7 @@ struct PD_INFER_DECL AnalysisConfig {
bool trt_allow_build_at_runtime_{false};
// tune to get dynamic_shape info.
bool trt_tuned_dynamic_shape_{false};
bool trt_use_inspector_{false};
// In CollectShapeInfo mode, we will collect the shape information of
// all intermediate tensors in the compute graph and calculate the
......
......@@ -57,7 +57,6 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
} else {
#if IS_TRT_VERSION_GE(6000)
infer_context->enqueueV2(buffers->data(), stream, nullptr);
GetEngineInfo();
#endif
}
SetRuntimeBatch(batch_size);
......@@ -244,8 +243,10 @@ void TensorRTEngine::FreezeNetwork() {
#endif
}
#if IS_TRT_VERSION_GE(8200)
infer_builder_config_->setProfilingVerbosity(
nvinfer1::ProfilingVerbosity::kDETAILED);
if (use_inspector_) {
infer_builder_config_->setProfilingVerbosity(
nvinfer1::ProfilingVerbosity::kDETAILED);
}
#endif
#if IS_TRT_VERSION_LT(8000)
......@@ -411,6 +412,21 @@ void TensorRTEngine::freshDeviceId() {
platform::SetDeviceId(device_id_);
}
void TensorRTEngine::GetEngineInfo() {
#if IS_TRT_VERSION_GE(8200)
LOG(INFO) << "====== engine info ======";
std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
infer_engine_->createEngineInspector());
auto infer_context = context();
infer_inspector->setExecutionContext(infer_context);
LOG(INFO) << infer_inspector->getEngineInformation(
nvinfer1::LayerInformationFormat::kONELINE);
LOG(INFO) << "====== engine info end ======";
#else
LOG(INFO) << "Inspector needs TensorRT version 8.2 and after.";
#endif
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
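
`GetEngineInfo` relies on `nvinfer1::IEngineInspector`, which TensorRT only provides from 8.2 onward, so builds against older TensorRT log just the hint message. A small sketch, mirroring the version check performed by the new unit test further down, for deciding whether to expect the detailed layer-by-layer output:

```python
import paddle.inference as paddle_infer

# Both calls return (major, minor, patch) tuples, as used in test_trt_inspector.py.
compile_version = paddle_infer.get_trt_compile_version()
runtime_version = paddle_infer.get_trt_runtime_version()

if compile_version >= (8, 2, 0) and runtime_version >= (8, 2, 0):
    print("Inspector output available: engine info is logged in kONELINE format.")
else:
    print("Inspector unavailable: only the TensorRT-version hint is logged.")
```
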
......@@ -580,17 +580,10 @@ class TensorRTEngine {
}
void SetProfileNum(int num) { max_profile_num_ = num; }
void GetEngineInfo() {
#if IS_TRT_VERSION_GE(8200)
std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
infer_engine_->createEngineInspector());
infer_inspector->setExecutionContext(context());
VLOG(3) << infer_inspector->getEngineInformation(
nvinfer1::LayerInformationFormat::kJSON);
#else
VLOG(3) << "Inspector needs TensorRT version 8.2 and after.";
#endif
}
void GetEngineInfo();
void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
......@@ -664,6 +657,7 @@ class TensorRTEngine {
std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_;
#endif
std::mutex mutex_;
bool use_inspector_;
}; // class TensorRTEngine
// Add a layer__ into engine__ with args ARGS.
......
......@@ -140,6 +140,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
bool enable_int8_;
bool enable_fp16_;
bool use_calib_mode_;
bool use_inspector_;
std::string calibration_data_;
std::string engine_key_;
std::string calibration_engine_key_;
......@@ -175,6 +176,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
shape_range_info_path_ = Attr<std::string>("shape_range_info_path");
allow_build_at_runtime_ = Attr<bool>("allow_build_at_runtime");
use_static_engine_ = Attr<bool>("use_static_engine");
use_inspector_ = HasAttr("use_inspector") && Attr<bool>("use_inspector");
if (use_static_engine_) {
model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir");
}
......@@ -285,6 +287,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
return;
}
auto *trt_engine = GetEngine(scope, dev_place);
if (use_inspector_) {
trt_engine->GetEngineInfo();
}
if (trt_engine->with_dynamic_shape()) {
// get runtime input shapes.
std::map<std::string, std::vector<int32_t>> runtime_input_shape;
......@@ -331,7 +336,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
anc = &scope;
}
PrepareTRTEngine(*anc, trt_engine);
// update shape_range_info_pbtxt
if (!shape_range_info_path_.empty()) {
inference::UpdateShapeRangeInfo(
......
......@@ -615,6 +615,10 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
.def("enable_tensorrt_inspector",
&AnalysisConfig::EnableTensorRtInspector)
.def("tensorrt_inspector_enabled",
&AnalysisConfig::tensorrt_inspector_enabled)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne", &AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3)
......
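
These bindings expose the new switch on the Python side of `AnalysisConfig` (i.e. `paddle.inference.Config`). A minimal usage sketch, assuming a hypothetical exported inference model at `./model/model.pdmodel` and `./model/model.pdiparams` and a GPU build of Paddle linked against TensorRT:

```python
from paddle.inference import Config, PrecisionType, create_predictor

# Hypothetical model files; replace with a real exported inference model.
config = Config("./model/model.pdmodel", "./model/model.pdiparams")
config.enable_use_gpu(100, 0)
config.enable_tensorrt_engine(
    1 << 30,                # workspace_size
    1,                      # max_batch_size
    3,                      # min_subgraph_size
    PrecisionType.Float32,  # precision
    False,                  # use_static
    False)                  # use_calib_mode
config.enable_tensorrt_inspector()          # new switch added in this commit
assert config.tensorrt_inspector_enabled()  # new query added in this commit

predictor = create_predictor(config)
# With TensorRT >= 8.2, the per-layer engine information is written to the log
# when the TensorRT subgraph op executes (i.e. during predictor.run());
# otherwise only a hint about the required TensorRT version is logged.
```
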
......@@ -75,6 +75,7 @@ set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120)
set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120)
#set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200)
set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120)
set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60)
if(WITH_NV_JETSON)
set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
......
......@@ -122,6 +122,11 @@ class InferencePassTest(unittest.TestCase):
self.trt_parameters.precision,
self.trt_parameters.use_static,
self.trt_parameters.use_calib_mode)
if self.trt_parameters.use_inspector:
config.enable_tensorrt_inspector()
self.assertTrue(
config.tensorrt_inspector_enabled(),
"The inspector option is not set correctly.")
if self.dynamic_shape_params:
config.set_trt_dynamic_shape_info(
......@@ -244,14 +249,21 @@ class InferencePassTest(unittest.TestCase):
Prepare TensorRT subgraph engine parameters.
'''
def __init__(self, workspace_size, max_batch_size, min_subgraph_size,
precision, use_static, use_calib_mode):
def __init__(self,
workspace_size,
max_batch_size,
min_subgraph_size,
precision,
use_static,
use_calib_mode,
use_inspector=False):
self.workspace_size = workspace_size
self.max_batch_size = max_batch_size
self.min_subgraph_size = min_subgraph_size
self.precision = precision
self.use_static = use_static
self.use_calib_mode = use_calib_mode
self.use_inspector = use_inspector
class DynamicShapeParam:
'''
......
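
`TensorRTParam` keeps its previous positional fields and gains an optional `use_inspector` flag (defaulting to `False`). A sketch of the relevant `setUp` lines for a test derived from `InferencePassTest` that requests inspector output, assuming the same imports as the new test below; the concrete values are illustrative:

```python
# Inside setUp() of an InferencePassTest subclass:
self.enable_trt = True
self.trt_parameters = InferencePassTest.TensorRTParam(
    workspace_size=1 << 30,
    max_batch_size=1,
    min_subgraph_size=3,
    precision=AnalysisConfig.Precision.Float32,
    use_static=False,
    use_calib_mode=False,
    use_inspector=True)  # new optional flag; triggers enable_tensorrt_inspector()
```
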
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import threading
import time
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
from paddle.fluid.core import AnalysisConfig
import subprocess
class TensorRTInspectorTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(name="data", shape=[1, 16, 16], dtype="float32")
matmul_out = fluid.layers.matmul(
x=data,
y=data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
self.feeds = {"data": np.ones([1, 16, 16]).astype("float32"), }
self.enable_trt = True
self.trt_parameters = InferencePassTest.TensorRTParam(
1 << 30, 1, 0, AnalysisConfig.Precision.Float32, False, False, True)
self.fetch_list = [out]
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 2.0
def test_check_output(self):
if core.is_compiled_with_cuda():
build_engine = subprocess.run(
[sys.executable, 'test_trt_inspector.py', '--build-engine'],
stderr=subprocess.PIPE)
engine_info = build_engine.stderr.decode('ascii')
trt_compile_version = paddle.inference.get_trt_compile_version()
trt_runtime_version = paddle.inference.get_trt_runtime_version()
valid_version = (8, 2, 0)
if trt_compile_version >= valid_version and trt_runtime_version >= valid_version:
self.assertTrue('====== engine info ======' in engine_info)
self.assertTrue('====== engine info end ======' in engine_info)
self.assertTrue('matmul' in engine_info)
self.assertTrue('LayerType: Scale' in engine_info)
self.assertTrue('batch_norm' in engine_info)
else:
self.assertTrue(
'Inspector needs TensorRT version 8.2 and after.' in
engine_info)
if __name__ == "__main__":
if '--build-engine' in sys.argv:
test = TensorRTInspectorTest()
test.setUp()
use_gpu = True
test.check_output_with_option(use_gpu)
else:
unittest.main()