From 61221ebc28a9cc6f953715b54838b810e06f8df9 Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Sat, 25 May 2019 12:17:54 +0800 Subject: [PATCH] TRT: Support set dynamic range in int8 mode. (#17524) * fluid int8 train and trt int8 predict align. trt int8 predict init op converter * 2. align fluid int8 train and trt int8 inference. enhance quant dequant fuse pass enhance op converter, trt engine, trt engine op, trt subgraph pass. * 3. add delete_quant_dequant_pass for trt test=develop * 4. add the missing file test=develop * 5. i modify the c++ interface, but forget to modify the pybind code fix the IS_TRT_VERSION_GE bug, and fix elementwise op converter test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/delete_quant_dequant_op_pass.cc | 82 ++++++++++++++++++ .../ir/delete_quant_dequant_op_pass.h | 34 ++++++++ paddle/fluid/framework/ir/fc_fuse_pass.cc | 5 ++ .../framework/ir/graph_pattern_detector.cc | 85 ++++++++++++++++--- .../framework/ir/graph_pattern_detector.h | 17 +++- .../ir/quant_conv2d_dequant_fuse_pass.cc | 73 ++++++++++------ .../fluid/inference/anakin/convert/conv2d.cc | 7 +- .../inference/anakin/convert/conv2d_fusion.cc | 7 +- paddle/fluid/inference/anakin/convert/fc.cc | 7 +- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 2 + .../analysis/ir_passes/subgraph_util.cc | 4 +- .../analysis/ir_passes/subgraph_util.h | 2 +- .../ir_passes/tensorrt_subgraph_pass.cc | 11 ++- paddle/fluid/inference/api/analysis_config.cc | 15 ++-- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/api/paddle_analysis_config.h | 4 +- .../inference/api/paddle_pass_builder.cc | 18 ++++ .../fluid/inference/api/paddle_pass_builder.h | 1 + .../tensorrt/convert/activation_op.cc | 13 +-- .../tensorrt/convert/batch_norm_op.cc | 8 +- .../inference/tensorrt/convert/concat_op.cc | 8 +- .../inference/tensorrt/convert/conv2d_op.cc | 47 ++++++---- .../inference/tensorrt/convert/dropout_op.cc | 7 +- .../tensorrt/convert/elementwise_op.cc | 55 ++++++------ .../fluid/inference/tensorrt/convert/fc_op.cc | 71 +++++++++++----- .../tensorrt/convert/leaky_relu_op.cc | 10 +-- .../inference/tensorrt/convert/op_converter.h | 15 ++++ .../inference/tensorrt/convert/pad_op.cc | 8 +- .../inference/tensorrt/convert/pool2d_op.cc | 12 +-- .../inference/tensorrt/convert/prelu_op.cc | 9 +- .../inference/tensorrt/convert/softmax_op.cc | 10 ++- .../inference/tensorrt/convert/ut_helper.h | 3 +- paddle/fluid/inference/tensorrt/engine.cc | 79 ++++++++++++++++- paddle/fluid/inference/tensorrt/engine.h | 14 +++ paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.h | 2 + .../operators/tensorrt/tensorrt_engine_op.h | 5 +- .../tensorrt/tensorrt_engine_op_test.cc | 2 + paddle/fluid/pybind/inference_api.cc | 2 +- 41 files changed, 563 insertions(+), 196 deletions(-) create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3210f3041a..bfba73c289 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -75,6 +75,7 @@ pass_library(runtime_context_cache_pass base) pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(fillconstant_elementwisemul_fuse inference) pass_library(shuffle_channel_detect_pass inference) +pass_library(delete_quant_dequant_op_pass inference) 
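# delete_quant_dequant_op_pass (registered above) removes the
# fake_quantize_dequantize_moving_average_abs_max ops inserted by quant-aware
# training and reconnects each consumer directly to the original tensor; see
# the pass implementation below.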
if(ANAKIN_FOUND) pass_library(simplify_anakin_priorbox_detection_out_pass inference) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc new file mode 100644 index 0000000000..3d4df87ab7 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(quant_dequant_op_inscale); \ + GET_IR_NODE(quant_dequant_op); \ + GET_IR_NODE(quant_dequant_op_outscale); \ + GET_IR_NODE(quant_dequant_op_out); \ + GET_IR_NODE(any_op2); + +void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_quantdequant_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(), + pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + // auto input_args_names = any_op2_desc->InputArgumentNames(); + auto var_map = any_op2_desc->Inputs(); + + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + quant_dequant_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != quant_dequant_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {quant_dequant_op, quant_dequant_op_out, + quant_dequant_op_inscale, quant_dequant_op_outscale}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_quant_dequant_op_pass, + paddle::framework::ir::DeleteQuantDequantOpPass); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h new file mode 100644 index 0000000000..938ada6453 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class DeleteQuantDequantOpPass : public FusePassBase { + public: + virtual ~DeleteQuantDequantOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 4691b9abfd..102fd38865 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -78,6 +78,11 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); + if (base_op_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale")); + auto elementwise_desc = elementwise_add->Op(); + if (elementwise_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale")); } desc.SetType("fc"); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f0d47ad57f..d50ca63603 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1738,13 +1738,16 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const std::string &op_type, const std::string &weight_name, int times, - const std::string &quant_type) { - const int kNumFields = 5; + const std::string &quant_type, + const std::string &dequant_type) { + int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + // the quant op always be one. 
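+  // Node handles for the repeated (quantized op -> dequant op) chains are
+  // later collected into a flat vector, so entry k of repetition i is
+  // addressed as nodes[i * kNumFields + k] using the offsets above.
+  // kNumFields is bumped to 6 only when dequant_type is
+  // "fake_channel_wise_dequantize_max_abs", which adds the per-channel
+  // weight-scale input at kDequantOpWeightScaleOffset.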
auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) ->assert_is_op_input(quant_type, "InScale") @@ -1752,11 +1755,19 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, auto quant_op = pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); - auto quant_op_out_scale = - pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output(quant_type, "OutScale") - ->assert_is_op_input("fake_dequantize_max_abs", "Scale") - ->AsIntermediate(); + PDNode *quant_op_out_scale = nullptr; + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_nth_input(dequant_type, "Scales", 1) + ->AsIntermediate(); + } else { + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_input(dequant_type, "Scale") + ->AsIntermediate(); + } auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) ->assert_is_op_output(quant_type, "Out") @@ -1777,16 +1788,25 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, nodes.push_back( pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i)) ->assert_is_op_output(op_type) - ->assert_is_op_input("fake_dequantize_max_abs", "X") + ->assert_is_op_input(dequant_type, "X") ->AsIntermediate()); nodes.push_back( pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i)) - ->assert_is_op("fake_dequantize_max_abs")); + ->assert_is_op(dequant_type)); + nodes.push_back( pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i)) - ->assert_is_op_output("fake_dequantize_max_abs", "Out") + ->assert_is_op_output(dequant_type, "Out") ->AsOutput()); + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(pattern + ->NewNode(GetNodeName("dequant_channel_scale") + + std::to_string(i)) + ->assert_is_op_nth_input(dequant_type, "Scales", 0) + ->AsInput()); + } } quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); @@ -1796,8 +1816,14 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kQuantizedOpOffset]}); - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale, + nodes[i * kNumFields + kDequantOpWeightScaleOffset]}); + } else { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + } nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kDequantOpOffset]}); } @@ -1834,6 +1860,41 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { reshape2_out->LinksFrom({reshape2_op}); } +void patterns::DeleteQuantDequantOpPattern::operator()() { + auto any_op_out = + pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "X") + ->AsInput(); + + auto quant_dequant_op_inscale = + pattern->NewNode(quant_dequant_op_inscale_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "InScale") + 
->AsInput(); + auto quant_dequant_op = + pattern->NewNode(quant_dequant_op_repr()) + ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max"); + + auto quant_dequant_out = + pattern->NewNode(quant_dequant_op_out_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "Out") + ->AsIntermediate(); + + auto quant_dequant_op_outscale = + pattern->NewNode(quant_dequant_op_outscale_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "OutScale") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale}); + quant_dequant_op_outscale->LinksFrom({quant_dequant_op}); + quant_dequant_out->LinksFrom({quant_dequant_op}); + any_op2->LinksFrom({quant_dequant_out}); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 7df2f5efc4..41f9d12858 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -954,7 +954,8 @@ struct QuantDequantOpFuse : public PatternBase { void operator()(PDNode* quant_op_input, const std::string& op_name, const std::string& weight_name, int times, - const std::string& quant_type); + const std::string& quant_type, + const std::string& dequant_type); std::string GetNodeName(const std::string& op_type) { return PDNodeName(name_scope_, repr_, id_, op_type); @@ -980,6 +981,20 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteQuantDequantOpPattern : public PatternBase { + DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(quant_dequant_op_inscale); + PATTERN_DECL_NODE(quant_dequant_op); + PATTERN_DECL_NODE(quant_dequant_op_outscale); + PATTERN_DECL_NODE(quant_dequant_op_out); + PATTERN_DECL_NODE(any_op2); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
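// The DeleteQuantDequantOpPattern declared above matches
//
//   any_op_out -> fake_quantize_dequantize_moving_average_abs_max
//              -> quant_dequant_op_out -> any_op2
//   (with the InScale input and OutScale output of the quant/dequant op)
//
// and DeleteQuantDequantOpPass rewrites it to
//
//   any_op_out -> any_op2
//
// by replacing quant_dequant_op_out with any_op_out in any_op2's input list
// and removing the quant/dequant op together with its scale nodes.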
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 017e3ef234..62fba440ed 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -25,16 +25,20 @@ namespace framework { namespace ir { void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, - const std::string& op_type, - const std::string& quant_type) { + const std::string& op_type, const std::string& quant_type, + const std::string& dequant_type) { const std::string pattern_name = "quant_dequant_fuse"; - // FusePassBase::Init(pattern_name, graph); - const int kNumFields = 5; + int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + } GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -42,22 +46,14 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, ->assert_is_op_input(quant_type, "X") ->AsInput(); - std::string quantized_op_type = ""; + std::string quantized_op_type = op_type; std::string weight_name = ""; - if (op_type == "conv2d") { - quantized_op_type = "conv2d"; - weight_name = "Filter"; - } else if (op_type == "depthwise_conv2d") { - quantized_op_type = "depthwise_conv2d"; - weight_name = "Filter"; - } else if (op_type == "conv2d_fusion") { - quantized_op_type = "conv2d_fusion"; + if (op_type == "conv2d" || op_type == "depthwise_conv2d" || + op_type == "conv2d_fusion") { weight_name = "Filter"; } else if (op_type == "mul") { - quantized_op_type = "mul"; weight_name = "Y"; } else if (op_type == "fc") { - quantized_op_type = "fc"; weight_name = "W"; } else { PADDLE_ENFORCE( @@ -66,7 +62,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, } patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); - pattern(x, quantized_op_type, weight_name, times, quant_type); + pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -91,6 +87,10 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i)))); nodes.push_back( subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i)))); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(subgraph.at( + pattern.GetPDNode("dequant_channel_scale" + std::to_string(i)))); + } } int bit_length = boost::get(quant_op->Op()->GetAttr("bit_length")); @@ -107,10 +107,31 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, std::unordered_set delete_nodes; for (int i = 0; i < times; i++) { - float max_range = boost::get( - nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range")); - float weight_scale = (range * range) / max_range; + std::vector weight_scale; + + // Get weight scale from dequant op. 
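+      // For fake_channel_wise_dequantize_max_abs the per-channel scales are
+      // read from the first "Scales" input tensor (one value per output
+      // channel); for fake_dequantize_max_abs a single scale is reconstructed
+      // from the op's max_range attribute as (range * range) / max_range,
+      // where range is derived from bit_length (127 for 8-bit quantization).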
+ if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + auto scales_name = + nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales"); + PADDLE_ENFORCE(scales_name.size() == 2); + const LoDTensor& channel_scale_tensor = + scope->FindVar(scales_name[0])->Get(); + PADDLE_ENFORCE( + paddle::platform::is_cpu_place(channel_scale_tensor.place())); + const float* channel_scale_data = channel_scale_tensor.data(); + for (int i = 0; i < channel_scale_tensor.numel(); i++) { + weight_scale.push_back(channel_scale_data[i]); + } + delete_nodes.insert( + nodes[i * kNumFields + kDequantOpWeightScaleOffset]); + } else { + float max_range = boost::get( + nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr( + "max_range")); + weight_scale.push_back((range * range) / max_range); + } + // create new op_desc auto base_op_desc = *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto(); std::string new_input = input_node->Name(); @@ -141,6 +162,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, IR_NODE_LINK_TO(input_node, new_op); IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op); IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]); delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]); delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]); @@ -160,16 +182,19 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "quant_dequant_fuse"; FusePassBase::Init(pattern_name, graph); + std::unordered_set dequant_types = { + "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; std::unordered_set quant_types = { "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; - std::unordered_set quantized_op_types = {"conv2d", "mul", "depthwise_conv2d"}; auto* scope = param_scope(); - for (auto& quant_type : quant_types) { - for (auto& op_type : quantized_op_types) { - for (int i = 6; i >= 1; i--) { - RunQuantDequant(graph, scope, i, op_type, quant_type); + for (auto& dequant_type : dequant_types) { + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type); + } } } } diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index 70e0adf5ea..26f78efa61 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -70,7 +70,8 @@ void Conv2dOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -91,8 +92,8 @@ void Conv2dOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_tensor( diff --git 
a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index a1568b8bde..f2e6003aa6 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -72,7 +72,8 @@ void Conv2dFusionOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -93,8 +94,8 @@ void Conv2dFusionOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index 0621e3377b..b64d0b84fd 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -76,7 +76,8 @@ void FcBaseOpConverter::operator()( ::anakin::saber::Shape anakin_shape(weight_shape); const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -95,8 +96,8 @@ void FcBaseOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_vector(trans_weight_data, diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 66e8d8b528..590baf4ee3 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -164,6 +164,7 @@ struct Argument { AnalysisConfig::Precision); DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); + DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, anakin_max_shape_t); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 371118ffaf..e22f1cbd2e 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -87,7 +87,9 @@ void IRPassManager::CreatePasses(Argument *argument, bool enable_int8 = argument->tensorrt_precision_mode() == AnalysisConfig::Precision::kInt8; + bool use_calib_mode = argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); + 
pass->Set("use_calib_mode", new bool(use_calib_mode)); bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 8f7c6ac755..e16cce54c2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -61,7 +61,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool is_trt) { + bool trt_and_not_int8) { //// In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv @@ -121,7 +121,7 @@ void RenameAndGetOutputs( for (auto out_var : correspond_node->outputs) { var2id[out_var->Name()] = out_var->id(); } - if (op_desc.Type() == "conv2d" && is_trt) { + if (op_desc.Type() == "conv2d" && trt_and_not_int8) { auto input_var_name = op_desc.Input("Input").front(); auto filter_var_name = op_desc.Input("Filter").front(); auto out_var_name = op_desc.Output("Output").front(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index bb44502782..444e1984cf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -43,7 +43,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool is_trt = true); + bool trt_and_not_int8 = false); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 67650a352d..3fad263b05 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -149,6 +149,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } + auto enable_int8 = Get("enable_int8"); + auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate @@ -165,7 +167,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // it is either an OP's input or an OP's output. RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, &output_names_with_id, &output_names, &output_name_map, - graph_var_map); + graph_var_map, !enable_int8); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -196,7 +198,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "parameters", params); - auto enable_int8 = Get("enable_int8"); auto use_static_engine = Get("use_static_engine"); auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, std::to_string(0)); @@ -204,13 +205,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // Get "" when there is no cached calibration table data. 
bool load_from_memory = Get("model_from_memory"); std::string calibration_data = ""; - if (enable_int8) { + if (enable_int8 && use_calib_mode) { calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); SetAttr(op_desc->Proto(), "engine_key", engine_key); std::string trt_engine_serialized_data = ""; SetAttr(op_desc->Proto(), "engine_serialized_data", @@ -222,7 +224,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } // When in int8 mode and calibration_mode, the program just produce the // calibration table data. - bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + bool calibration_mode = + (enable_int8 && calibration_data.size() == 0 && use_calib_mode); if (calibration_mode) { // calibraion mode means generate int8 calibration table data process. return; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4fe0c48d8f..67c5d2c0bd 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -105,6 +106,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(trt_use_static_engine_); + CP_MEMBER(trt_use_calib_mode_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -177,7 +179,8 @@ std::shared_ptr AnalysisConfig::mkldnn_quantizer_config() void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode, bool use_static) { + AnalysisConfig::Precision precision_mode, bool use_static, + bool use_calib_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -190,6 +193,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; trt_use_static_engine_ = use_static; + trt_use_calib_mode_ = use_calib_mode; Update(); #else @@ -228,13 +232,10 @@ void AnalysisConfig::Update() { } if (use_tensorrt_) { - const auto &passes = pass_builder_->AllPasses(); - if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == - std::end(passes)) { - // Append after the Affine_channel_conv_fuse pass. 
- pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + pass_builder()->ClearPasses(); + for (const auto &pass : kTRTSubgraphPasses) { + pass_builder()->AppendPass(pass); } - pass_builder()->DeletePass("runtime_context_cache_pass"); } if (use_mkldnn_) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e57d3a8045..ef874646eb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); + argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); } if (config_.anakin_engine_enabled()) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ebe289322b..8067cd777d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -142,7 +142,8 @@ struct AnalysisConfig { void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, Precision precision = Precision::kFloat32, - bool use_static = false); + bool use_static = false, + bool use_calib_mode = false); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -266,6 +267,7 @@ struct AnalysisConfig { int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; bool trt_use_static_engine_; + bool trt_use_calib_mode_; // memory reuse related. bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 2bad89cdb3..3dc9814d0d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -70,6 +70,24 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } +const std::vector kTRTSubgraphPasses({ + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + // "fc_fuse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", +}); + // The following passes works for Anakin sub-graph engine. 
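// A minimal caller-side sketch of the extended EnableTensorRtEngine API
// declared above (argument values and the model path are placeholders; the
// other AnalysisConfig calls are the existing setup methods):
//
//   paddle::AnalysisConfig config;
//   config.SetModel("/path/to/model");            // placeholder path
//   config.EnableUseGpu(100 /*MB*/, 0 /*gpu id*/);
//   config.EnableTensorRtEngine(1 << 20 /*workspace*/, 1 /*max batch*/,
//                               3 /*min subgraph*/,
//                               paddle::AnalysisConfig::Precision::kInt8,
//                               false /*use_static*/,
//                               false /*use_calib_mode*/);
//
// With use_calib_mode == false the int8 engine relies on the quant-aware
// training scales propagated by kTRTSubgraphPasses above rather than on a
// calibration table.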
const std::vector kAnakinSubgraphPasses({ "infer_clean_graph_pass", // diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 057e7dc65d..1a3430530f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -142,6 +142,7 @@ class GpuPassStrategy : public PassStrategy { virtual ~GpuPassStrategy() = default; }; +extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 0b756534ec..5c2454fa9a 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -43,12 +43,13 @@ class ActivationOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), op_pair->second); auto output_name = op_desc.Output("Out")[0]; - layer->setName((op_type_ + " (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + + RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index d017bac66d..d948868464 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,18 +116,12 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); - layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = std::move(combile_scale_tensor); - engine_->SetITensor(output_name, layer->getOutput(0)); - - if (test_mode) { - engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 525ba9dc34..ec771850ed 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,13 +42,7 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; - layer->setName(("concat (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. 
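-      // The per-converter boilerplate (setName, SetITensor and the test-mode
-      // DeclareOutput) is now centralized in OpConverter::RreplenishLayerAndOutput,
-      // added in op_converter.h further below.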
- engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 39a99a21ea..73bfa800f0 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -32,25 +32,31 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, PADDLE_ENFORCE(engine != nullptr); auto* X = engine->GetITensor(op_desc.Input("Input").front()); - - // Declare weights auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, + true, weight_scale); + engine->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false); + } - platform::CPUPlace cpu_place; - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - - auto* weight_data = weight_tensor->mutable_data(cpu_place); - - PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); - const int n_output = weight_tensor->dims()[0]; - const int n_input = weight_tensor->dims()[1]; - const int filter_h = weight_tensor->dims()[2]; - const int filter_w = weight_tensor->dims()[3]; + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int n_input = Y_t->dims()[1]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; const int groups = boost::get(op_desc.GetAttr("groups")); const std::vector dilations = boost::get>(op_desc.GetAttr("dilations")); @@ -66,7 +72,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - static_cast(weight_tensor->numel())}; + static_cast(Y_t->numel())}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = fadd_layer(const_cast(X), n_output, n_input, @@ -80,11 +86,16 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, auto output_name = op_desc.Output("Output").front(); layer->setName((name + " (Output: " + output_name + ")").c_str()); - engine->weight_map[op_desc.Input("Filter").front()] = - std::move(weight_tensor); layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); +#if IS_TRT_VERSION_GE(5000) + if (enable_int8) { + float output_scale = boost::get(op_desc.GetAttr("out_scale")); + engine->SetTensorDynamicRange(layer->getOutput(0), output_scale); + } +#endif + if (test_mode) { engine->DeclareOutput(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index ddbc724e3b..71177e5e66 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -55,11 +55,8 @@ class DropoutOpConverter : public OpConverter { engine_->weight_map[op_desc.Output("Out").front() + 
"_dropout"] = std::move(weight_tensor); auto output_name = op_desc.Output("Out")[0]; - layer->setName(("dropout (Output: " + output_name + ")").c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); - } + + RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 0c5a1a6ef1..a888b0803d 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -55,17 +55,13 @@ class ElementwiseWeightOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + weight_data = + engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); - platform::CPUPlace cpu_place; - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - auto* weight_data = - weight_tensor->mutable_data(platform::CPUPlace()); auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); + std::vector dims_y = framework::vectorize2int(Y_t->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } @@ -92,9 +88,9 @@ class ElementwiseWeightOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); } - TensorRTEngine::Weight shift_weights{ - nvinfer1::DataType::kFLOAT, static_cast(weight_data), - weight_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, @@ -112,14 +108,13 @@ class ElementwiseWeightOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - layer->setName( - ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, + test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } @@ -138,6 +133,7 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -153,13 +149,11 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer = elet_layer; } else { VLOG(3) << "Convert a fluid elementwise op to TensorRT " "ElementWisePluginLayer"; @@ -168,17 +162,18 @@ class ElementwiseTensorOpConverter : public OpConverter { new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); - nvinfer1::IPluginLayer* layer = engine_->AddPlugin( + nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin( const_cast(plugin->GetInputs().data()), 2, reinterpret_cast(plugin)); - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer = plugin_layer; } - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 42dcd68e40..fb7b89b189 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -53,33 +53,47 @@ class FcOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() >= 3; + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } // Declare inputs - auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); // Declare weights - auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. 
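    // The copy (and, in int8 mode, the dequantization of the int8-range weights
    // back to float using weight_scale) is now done by
    // TensorRTEngine::GetWeightCPUData. For the fused "fc" op the weight input
    // is named "W" and the feature input "Input" (with an extra "Bias" input),
    // while the plain "mul" op keeps "Y" and "X".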
- platform::CPUPlace cpu_place; - framework::LoDTensor weight_tensor; - weight_tensor.Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, &weight_tensor); - - auto* weight_data = weight_tensor.mutable_data(platform::CPUPlace()); + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), + Y_t, true, weight_scale); + engine_->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false); + } - PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix - size_t n_output = weight_tensor.dims()[1]; + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix + size_t n_output = Y_t->dims()[1]; std::unique_ptr tmp(new framework::LoDTensor()); - tmp->Resize(weight_tensor.dims()); + tmp->Resize(Y_t->dims()); memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); @@ -100,19 +114,32 @@ class FcOpConverter : public OpConverter { // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just // handle `mul`, leave `add` as another layer. // DEBUG - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + float* bias_data = nullptr; + int bias_num = 0; + if (with_bias) { + auto* b_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* b_t = b_v->GetMutable(); + bias_data = + engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false); + bias_num = b_t->numel(); + } + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_num)}; auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *const_cast(X), n_output, tmp_weight.get(), bias.get()); + engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp); auto output_name = op_desc.Output("Out").front(); - layer->setName(("fc (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); - if (test_mode) { - engine_->DeclareOutput(output_name); + + RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 3f6ed04c46..7753fda06c 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -76,15 +76,9 @@ class LeakyReluOpConverter : public OpConverter { engine_->weight_map.end()); engine_->weight_map[alpha_name] = std::move(alpha_tensor); - std::string layer_name = "leaky_relu (Output: "; auto output_name = op_desc.Output("Out")[0]; - output_layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, output_layer->getOutput(0)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } - output_layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name}, + test_mode); } 
}; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 55515569ea..96a722dc89 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -172,6 +172,21 @@ class OpConverter { engine->FreezeNetwork(); } + void RreplenishLayerAndOutput( + nvinfer1::ILayer* layer, const std::string& layer_type, + const std::vector& output_tensor_names, + bool test_mode = false) { + size_t num_out = output_tensor_names.size(); + for (size_t i = 0; i < num_out; i++) { + layer->getOutput(i)->setName(output_tensor_names[i].c_str()); + engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); + if (test_mode) { + engine_->DeclareOutput(output_tensor_names[i]); + } + } + layer->setName( + (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); + } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 4afcb0aece..bcd2166728 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -51,13 +51,7 @@ class PadOpConverter : public OpConverter { PADDLE_ENFORCE(layer != nullptr); auto output_name = op_desc.Output("Out")[0]; - engine_->SetITensor(output_name, layer->getOutput(0)); - layer->setName(("scale (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1d0d83d1f3..1752c52c3f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -148,11 +148,13 @@ class Pool2dOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - layer->setName(("pool2d (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 2ae804106e..01bcd03e52 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -58,15 +58,8 @@ class PReluOpConverter : public OpConverter { engine_->weight_map[op_desc.Input("Alpha")[0]] = std::move(alpha_tensor_temp); - std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } - layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); } }; diff 
--git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 80bfb2d190..b0ae169412 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -34,9 +34,13 @@ class SoftMaxOpConverter : public OpConverter { *const_cast(input1)); auto output_name = op_desc.Output("Out")[0]; - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 1856060cec..388d83d834 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -40,8 +40,7 @@ namespace tensorrt { * Get a random float value between [low, high] */ float random(float low, float high) { - static std::random_device rd; - static std::mt19937 mt(rd()); + static std::mt19937 mt(100); std::uniform_real_distribution dist(low, high); return dist(mt); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index fddf5f11c2..c5ac6f3841 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -53,10 +53,40 @@ void TensorRTEngine::FreezeNetwork() { infer_builder_->setMaxWorkspaceSize(max_workspace_); if (enable_int8_) { infer_builder_->setInt8Mode(true); - PADDLE_ENFORCE( - calibrator_ != nullptr, - "The precision mode is 'INT8', the calibrator should not be nullptr"); - infer_builder_->setInt8Calibrator(calibrator_); + if (calibrator_) { + infer_builder_->setInt8Calibrator(calibrator_); + } else { + infer_builder_->setInt8Calibrator(nullptr); + +#if IS_TRT_VERSION_GE(5000) + infer_builder_->setStrictTypeConstraints(true); + for (auto &quant_range : quant_dynamic_range_) { + auto tensor = quant_range.first; + float range = quant_range.second; + tensor->setDynamicRange(-range, range); + } + + std::unordered_set all_t; + for (int i = 0; i < infer_network_->getNbLayers(); i++) { + auto layer = infer_network_->getLayer(i); + for (int j = 0; j < layer->getNbOutputs(); j++) { + all_t.insert(layer->getOutput(j)); + } + } + for (int i = 0; i < infer_network_->getNbInputs(); i++) { + all_t.insert(infer_network_->getInput(i)); + } + + for (auto &t : all_t) { + if (!quant_dynamic_range_.count(t)) { + LOG(WARNING) + << "We are in trt int8 mode(not calibration), scale not setted" + << " for tensor " << t->getName() + << ", this might be ok when trt does not need this range"; + } + } +#endif + } } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); @@ -133,6 +163,47 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } +float *TensorRTEngine::GetWeightCPUData(const std::string &name, + framework::Tensor *weight_tensor, + bool enable_int8, + const std::vector &scale) { + auto w_dims = weight_tensor->dims(); + platform::CPUPlace cpu_place; + PADDLE_ENFORCE(!weight_map.count(name), + "During TRT Op converter: We set weight %s with the same name " + "twice into the weight_map", + name); + weight_map[name].reset(new framework::Tensor()); + 
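  // The remainder of this helper copies the weight to a CPU tensor owned by
  // weight_map and, when enable_int8 is set, converts the int8-range values
  // back to float (after checking they lie in [-128, 127]): each element is
  // multiplied by scale / 127, using a single scale for fc/mul weights or one
  // scale per output channel (scale[i / inner_size] with
  // inner_size = w_dims[1] * w_dims[2] * w_dims[3]) for 4-D conv filters.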
+  weight_map[name]->Resize(weight_tensor->dims());
+  TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
+  float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
+
+  if (enable_int8) {
+    // when the op is fc, scale's size should be 1
+    // when the op is conv, the scale's size should be w_dims[0]
+    bool valid_scale_size =
+        (scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
+    PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
+    for (int i = 0; i < weight_tensor->numel(); i++) {
+      bool is_valid_int8 =
+          ((weight_data[i] >= -128) && (weight_data[i] <= 127));
+      PADDLE_ENFORCE(is_valid_int8,
+                     "We are in trt subgraph int8 mode, the weight of conv "
+                     "should be in range [-128, 127]");
+      if (scale.size() == 1) {
+        weight_data[i] *= (scale[0] / 127);
+      } else {
+        PADDLE_ENFORCE(w_dims.size() == 4,
+                       "TRT int8 quant: We only use the channel quant for "
+                       "conv op, so the weight dims should be 4.");
+        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+        weight_data[i] *= (scale[i / inner_size] / 127);
+      }
+    }
+  }
+  return weight_data;
+}
+
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
 
 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 657dfd9355..0396b084b8 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -18,8 +18,10 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@@ -131,6 +133,13 @@ class TensorRTEngine {
   int GetDeviceId() { return device_id_; }
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int num_inputs, plugin::PluginTensorRT*);
+  void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
+    quant_dynamic_range_[tensor] = range;
+  }
+
+  float* GetWeightCPUData(const std::string& name,
+                          framework::Tensor* weight_tensor, bool enable_int8,
+                          const std::vector<float>& scale = {});
 
   // A pointer to CPU memory is needed of the TRT weight.
   // Before TRT runs, fluid loads weight into GPU storage.
@@ -184,8 +193,13 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
+  std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
 };  // class TensorRTEngine
 
+#define IS_TRT_VERSION_GE(version)                       \
+  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
+    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
+
 // Add a layer__ into engine__ with args ARGS.
 // For example:
 //
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 9fecad6eb3..8a5aed5d43 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -32,7 +32,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
        "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
        "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-       "conv2d_transpose", "leaky_relu"}};
+       "conv2d_transpose", "leaky_relu", "fc"}};
 };
 
 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
index b98f052bf2..3363d77af8 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #pragma once
+#include
 #include
+#include
 #include
 #include "paddle/fluid/framework/op_desc.h"
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 7f470924b3..1c32368e9d 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -48,6 +48,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
+  bool use_calib_mode_;
   std::string calibration_data_;
   std::string engine_key_;
   std::string engine_serialized_data_;
@@ -65,6 +66,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     workspace_size_ = Attr<int>("workspace_size");
     device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
+    use_calib_mode_ = Attr<bool>("use_calib_mode");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
     engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
@@ -75,7 +77,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     }
     // calibration_mode is true means we need to
     // generate the calibration table data.
-    calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0);
+    calibration_mode_ =
+        (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_);
     VLOG(4) << "calibration_mode: " << calibration_mode_;
 
     if (enable_int8_ && calibration_data_.size()) {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index cc4d8d6e6f..b39508a34d 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -104,6 +104,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index b650225c64..8ec9806f5f 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -229,7 +229,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
            py::arg("min_subgraph_size") = 3,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
-           py::arg("use_static") = true)
+           py::arg("use_static") = true, py::arg("use_calib_mode") = false)
      .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
           py::arg("max_batch_size") = 1,
           py::arg("max_input_shape") =
-- 
GitLab
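
Usage note (illustrative, not part of the patch): after this change the TensorRT subgraph can run in int8 either through the calibrator as before (use_calib_mode = true) or by taking per-tensor dynamic ranges from a quant-aware-trained model (use_calib_mode = false). The sketch below shows how a caller might exercise the second path from C++; the model directory is hypothetical, and the EnableTensorRTEngine argument order is assumed to mirror the pybind defaults shown above, so it should be verified against paddle_analysis_config.h.

  #include "paddle/fluid/inference/api/paddle_inference_api.h"

  int main() {
    paddle::AnalysisConfig config;
    config.SetModel("quantized_model_dir");   // hypothetical model directory
    config.EnableUseGpu(100 /*MB memory pool*/, 0 /*GPU id*/);
    // Int8 without calibration: the scales recorded by the quant/dequant
    // passes are set as dynamic ranges, so no calibration table is built.
    config.EnableTensorRTEngine(1 << 20 /*workspace_size*/,
                                1 /*max_batch_size*/, 3 /*min_subgraph_size*/,
                                paddle::AnalysisConfig::Precision::kInt8,
                                false /*use_static*/, false /*use_calib_mode*/);
    auto predictor =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
    return predictor != nullptr ? 0 : 1;
  }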