Unverified commit 61221ebc, authored by Zhaolong Xing, committed by GitHub

TRT: Support setting the dynamic range in int8 mode. (#17524)

* 1. Align Fluid int8 training with TRT int8 prediction:
     initial TRT int8 prediction support and op converters.

* 2. Align Fluid int8 training and TRT int8 inference:
     enhance the quant-dequant fuse pass;
     enhance the op converters, TRT engine, TRT engine op, and TRT subgraph pass.

* 3. Add delete_quant_dequant_pass for TRT.

  test=develop

* 4. Add the missing file.
  test=develop

* 5. Update the pybind code that was missed when the C++ interface changed,
     fix the IS_TRT_VERSION_GE bug, and fix the elementwise op converter.
  test=develop
Parent commit: 0c39b97b
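For orientation, a rough usage sketch of the API change carried by this commit (the new `use_calib_mode` argument on `AnalysisConfig::EnableTensorRtEngine`, shown in the diff below). The model path, GPU memory figure, and batch/workspace sizes are made-up placeholders, not taken from this commit:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: build an int8 TensorRT config that relies on the scales recorded by
// the quant-aware training passes instead of a TRT calibration table.
paddle::AnalysisConfig MakeTrtInt8Config() {
  paddle::AnalysisConfig config;
  config.SetModel("./quant_model");            // hypothetical model directory
  config.EnableUseGpu(100 /*MB*/, 0 /*gpu id*/);
  // New in this commit: the trailing use_calib_mode flag. Passing false skips
  // the calibration-table path, so the per-tensor dynamic ranges produced by
  // quant_conv2d_dequant_fuse_pass / delete_quant_dequant_op_pass are used.
  config.EnableTensorRtEngine(1 << 20 /*workspace*/, 1 /*max_batch*/,
                              3 /*min_subgraph_size*/,
                              paddle::AnalysisConfig::Precision::kInt8,
                              false /*use_static*/, false /*use_calib_mode*/);
  return config;
}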
@@ -75,6 +75,7 @@ pass_library(runtime_context_cache_pass base)
 pass_library(quant_conv2d_dequant_fuse_pass inference)
 pass_library(fillconstant_elementwisemul_fuse inference)
 pass_library(shuffle_channel_detect_pass inference)
+pass_library(delete_quant_dequant_op_pass inference)
 if(ANAKIN_FOUND)
 pass_library(simplify_anakin_priorbox_detection_out_pass inference)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES                         \
  GET_IR_NODE(any_op_out);                \
  GET_IR_NODE(quant_dequant_op_inscale);  \
  GET_IR_NODE(quant_dequant_op);          \
  GET_IR_NODE(quant_dequant_op_outscale); \
  GET_IR_NODE(quant_dequant_op_out);      \
  GET_IR_NODE(any_op2);

void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
  const std::string pattern_name = "delete_quantdequant_op_pattern";
  FusePassBase::Init(pattern_name, graph);
  GraphPatternDetector gpd;

  patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(),
                                                pattern_name);
  pattern();

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    GET_NODES;
    IR_NODE_LINK_TO(any_op_out, any_op2);
    std::string any_op_out_name = any_op_out->Var()->Name();
    std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name();

    auto* any_op2_desc = any_op2->Op();
    // auto input_args_names = any_op2_desc->InputArgumentNames();
    auto var_map = any_op2_desc->Inputs();

    for (auto& name_m : var_map) {
      if (std::find(name_m.second.begin(), name_m.second.end(),
                    quant_dequant_op_out_name) != name_m.second.end()) {
        std::vector<std::string> new_inputs;
        for (auto& i_n : name_m.second) {
          if (i_n != quant_dequant_op_out_name) {
            new_inputs.push_back(i_n);
          }
        }
        new_inputs.push_back(any_op_out_name);
        any_op2_desc->SetInput(name_m.first, new_inputs);
        any_op2_desc->Flush();
      }
    }
    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph,
                         {quant_dequant_op, quant_dequant_op_out,
                          quant_dequant_op_inscale, quant_dequant_op_outscale});
  };

  gpd(graph, handler);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(delete_quant_dequant_op_pass,
paddle::framework::ir::DeleteQuantDequantOpPass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class DeleteQuantDequantOpPass : public FusePassBase {
 public:
  virtual ~DeleteQuantDequantOpPass() {}

 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -78,6 +78,11 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const {
       desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8"));
       desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale"));
       desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale"));
+      if (base_op_desc->HasAttr("out_scale"))
+        desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale"));
+      auto elementwise_desc = elementwise_add->Op();
+      if (elementwise_desc->HasAttr("out_scale"))
+        desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale"));
     }
     desc.SetType("fc");
......
@@ -1738,13 +1738,16 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
                                             const std::string &op_type,
                                             const std::string &weight_name,
                                             int times,
-                                            const std::string &quant_type) {
-  const int kNumFields = 5;
+                                            const std::string &quant_type,
+                                            const std::string &dequant_type) {
+  int kNumFields = 5;
   const int kQuantizedWeightOffset = 0;
   const int kQuantizedOpOffset = 1;
   const int kQuantizedOpOutOffset = 2;
   const int kDequantOpOffset = 3;
   const int kDequantOpOutOffset = 4;
+  const int kDequantOpWeightScaleOffset = 5;
   // the quant op always be one.
   auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale"))
                                ->assert_is_op_input(quant_type, "InScale")
@@ -1752,11 +1755,19 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
   auto quant_op =
       pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type);
 
-  auto quant_op_out_scale =
-      pattern->NewNode(GetNodeName("quant_op_out_scale"))
-          ->assert_is_op_output(quant_type, "OutScale")
-          ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
-          ->AsIntermediate();
+  PDNode *quant_op_out_scale = nullptr;
+  if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+    kNumFields += 1;
+    quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale"))
+                             ->assert_is_op_output(quant_type, "OutScale")
+                             ->assert_is_op_nth_input(dequant_type, "Scales", 1)
+                             ->AsIntermediate();
+  } else {
+    quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale"))
+                             ->assert_is_op_output(quant_type, "OutScale")
+                             ->assert_is_op_input(dequant_type, "Scale")
+                             ->AsIntermediate();
+  }
 
   auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out"))
                           ->assert_is_op_output(quant_type, "Out")
@@ -1777,16 +1788,25 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
     nodes.push_back(
         pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
            ->assert_is_op_output(op_type)
-           ->assert_is_op_input("fake_dequantize_max_abs", "X")
+           ->assert_is_op_input(dequant_type, "X")
            ->AsIntermediate());
     nodes.push_back(
         pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
-           ->assert_is_op("fake_dequantize_max_abs"));
+           ->assert_is_op(dequant_type));
     nodes.push_back(
         pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
-           ->assert_is_op_output("fake_dequantize_max_abs", "Out")
+           ->assert_is_op_output(dequant_type, "Out")
            ->AsOutput());
+    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+      nodes.push_back(pattern
+                          ->NewNode(GetNodeName("dequant_channel_scale") +
+                                    std::to_string(i))
+                          ->assert_is_op_nth_input(dequant_type, "Scales", 0)
+                          ->AsInput());
+    }
   }
 
   quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
@@ -1796,8 +1816,14 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
         {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
     nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
         {nodes[i * kNumFields + kQuantizedOpOffset]});
-    nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
-        {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
+    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+      nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
+          {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale,
+           nodes[i * kNumFields + kDequantOpWeightScaleOffset]});
+    } else {
+      nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
+          {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
+    }
     nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
         {nodes[i * kNumFields + kDequantOpOffset]});
   }
@@ -1834,6 +1860,41 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) {
   reshape2_out->LinksFrom({reshape2_op});
 }
 
+void patterns::DeleteQuantDequantOpPattern::operator()() {
+  auto any_op_out =
+      pattern->NewNode(any_op_out_repr())
+          ->assert_is_op_input(
+              "fake_quantize_dequantize_moving_average_abs_max", "X")
+          ->AsInput();
+
+  auto quant_dequant_op_inscale =
+      pattern->NewNode(quant_dequant_op_inscale_repr())
+          ->assert_is_op_input(
+              "fake_quantize_dequantize_moving_average_abs_max", "InScale")
+          ->AsInput();
+  auto quant_dequant_op =
+      pattern->NewNode(quant_dequant_op_repr())
+          ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max");
+
+  auto quant_dequant_out =
+      pattern->NewNode(quant_dequant_op_out_repr())
+          ->assert_is_op_output(
+              "fake_quantize_dequantize_moving_average_abs_max", "Out")
+          ->AsIntermediate();
+
+  auto quant_dequant_op_outscale =
+      pattern->NewNode(quant_dequant_op_outscale_repr())
+          ->assert_is_op_output(
+              "fake_quantize_dequantize_moving_average_abs_max", "OutScale")
+          ->AsOutput();
+  auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput();
+
+  quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale});
+  quant_dequant_op_outscale->LinksFrom({quant_dequant_op});
+  quant_dequant_out->LinksFrom({quant_dequant_op});
+  any_op2->LinksFrom({quant_dequant_out});
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -954,7 +954,8 @@ struct QuantDequantOpFuse : public PatternBase {
   void operator()(PDNode* quant_op_input, const std::string& op_name,
                   const std::string& weight_name, int times,
-                  const std::string& quant_type);
+                  const std::string& quant_type,
+                  const std::string& dequant_type);
 
   std::string GetNodeName(const std::string& op_type) {
     return PDNodeName(name_scope_, repr_, id_, op_type);
@@ -980,6 +981,20 @@ struct ShuffleChannelPattern : public PatternBase {
   PATTERN_DECL_NODE(reshape2_out);
 };
 
+struct DeleteQuantDequantOpPattern : public PatternBase {
+  DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {}
+
+  void operator()();
+
+  PATTERN_DECL_NODE(any_op_out);
+  PATTERN_DECL_NODE(quant_dequant_op_inscale);
+  PATTERN_DECL_NODE(quant_dequant_op);
+  PATTERN_DECL_NODE(quant_dequant_op_outscale);
+  PATTERN_DECL_NODE(quant_dequant_op_out);
+  PATTERN_DECL_NODE(any_op2);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
......
@@ -25,16 +25,20 @@ namespace framework {
 namespace ir {
 
 void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
-                     const std::string& op_type,
-                     const std::string& quant_type) {
+                     const std::string& op_type, const std::string& quant_type,
+                     const std::string& dequant_type) {
   const std::string pattern_name = "quant_dequant_fuse";
-  // FusePassBase::Init(pattern_name, graph);
-  const int kNumFields = 5;
+  int kNumFields = 5;
   const int kQuantizedWeightOffset = 0;
   const int kQuantizedOpOffset = 1;
   const int kQuantizedOpOutOffset = 2;
   const int kDequantOpOffset = 3;
   const int kDequantOpOutOffset = 4;
+  const int kDequantOpWeightScaleOffset = 5;
+  if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+    kNumFields += 1;
+  }
 
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
@@ -42,22 +46,14 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
                 ->assert_is_op_input(quant_type, "X")
                 ->AsInput();
 
-  std::string quantized_op_type = "";
+  std::string quantized_op_type = op_type;
   std::string weight_name = "";
-  if (op_type == "conv2d") {
-    quantized_op_type = "conv2d";
-    weight_name = "Filter";
-  } else if (op_type == "depthwise_conv2d") {
-    quantized_op_type = "depthwise_conv2d";
-    weight_name = "Filter";
-  } else if (op_type == "conv2d_fusion") {
-    quantized_op_type = "conv2d_fusion";
+  if (op_type == "conv2d" || op_type == "depthwise_conv2d" ||
+      op_type == "conv2d_fusion") {
     weight_name = "Filter";
   } else if (op_type == "mul") {
-    quantized_op_type = "mul";
     weight_name = "Y";
   } else if (op_type == "fc") {
-    quantized_op_type = "fc";
     weight_name = "W";
   } else {
     PADDLE_ENFORCE(
@@ -66,7 +62,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
   }
 
   patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(x, quantized_op_type, weight_name, times, quant_type);
+  pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type);
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -91,6 +87,10 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
           subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
       nodes.push_back(
          subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
+      if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+        nodes.push_back(subgraph.at(
+            pattern.GetPDNode("dequant_channel_scale" + std::to_string(i))));
+      }
     }
 
     int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
@@ -107,10 +107,31 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
     std::unordered_set<const Node*> delete_nodes;
 
     for (int i = 0; i < times; i++) {
-      float max_range = boost::get<float>(
-          nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
-      float weight_scale = (range * range) / max_range;
+      std::vector<float> weight_scale;
+
+      // Get weight scale from dequant op.
+      if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+        auto scales_name =
+            nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales");
+        PADDLE_ENFORCE(scales_name.size() == 2);
+        const LoDTensor& channel_scale_tensor =
+            scope->FindVar(scales_name[0])->Get<LoDTensor>();
+        PADDLE_ENFORCE(
+            paddle::platform::is_cpu_place(channel_scale_tensor.place()));
+        const float* channel_scale_data = channel_scale_tensor.data<float>();
+        for (int i = 0; i < channel_scale_tensor.numel(); i++) {
+          weight_scale.push_back(channel_scale_data[i]);
+        }
+        delete_nodes.insert(
+            nodes[i * kNumFields + kDequantOpWeightScaleOffset]);
+      } else {
+        float max_range = boost::get<float>(
+            nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr(
                "max_range"));
+        weight_scale.push_back((range * range) / max_range);
+      }
 
+      // create new op_desc
       auto base_op_desc =
           *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
       std::string new_input = input_node->Name();
@@ -141,6 +162,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
       IR_NODE_LINK_TO(input_node, new_op);
       IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
       IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
+
       delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
       delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
       delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
@@ -160,16 +182,19 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "quant_dequant_fuse";
   FusePassBase::Init(pattern_name, graph);
 
+  std::unordered_set<std::string> dequant_types = {
+      "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"};
   std::unordered_set<std::string> quant_types = {
       "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
   std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
                                                         "depthwise_conv2d"};
   auto* scope = param_scope();
-  for (auto& quant_type : quant_types) {
-    for (auto& op_type : quantized_op_types) {
-      for (int i = 6; i >= 1; i--) {
-        RunQuantDequant(graph, scope, i, op_type, quant_type);
+  for (auto& dequant_type : dequant_types) {
+    for (auto& quant_type : quant_types) {
+      for (auto& op_type : quantized_op_types) {
+        for (int i = 6; i >= 1; i--) {
+          RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type);
+        }
      }
    }
  }
......
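A side note on the scale arithmetic in the non-channel-wise branch above. Assuming the usual Fluid convention that range = (1 << (bit_length - 1)) - 1 (so 127 for 8 bits) and that fake_dequantize_max_abs stores max_range ~ range * range / weight_scale, the fuse pass recovers the weight scale as sketched below. RecoverWeightScale is a hypothetical helper added only for illustration; it is not part of this commit.

// Hedged sketch of the weight-scale recovery performed by the fuse pass.
float RecoverWeightScale(int bit_length, float max_range) {
  // Assumption: range = (1 << (bit_length - 1)) - 1, i.e. 127 for 8 bits.
  const float range = static_cast<float>((1 << (bit_length - 1)) - 1);
  // With max_range ~= range * range / weight_scale, the division below
  // gives back the per-tensor weight scale pushed into weight_scale[].
  return (range * range) / max_range;
}
// Example: bit_length = 8 and weight_scale = 0.5 imply max_range = 32258,
// and RecoverWeightScale(8, 32258.f) returns roughly 0.5.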
@@ -70,7 +70,8 @@ void Conv2dOpConverter<TargetT, PrecisionT>::operator()(
   if (enable_int8) {
     const float int8_range = 127.;
     float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
+    auto weight_scale =
+        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
     PBlock<TargetT> *weight1 =
         new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
     this->engine_->RegistBlock(weight1);
@@ -91,8 +92,8 @@ void Conv2dOpConverter<TargetT, PrecisionT>::operator()(
     weight1->d_tensor().copy_from(weight1->h_tensor());
     this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
     this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(op_name,
-                                            {weight_scale / int8_range}, false);
+    this->engine_->Graph()->SetWeightsScale(
+        op_name, {weight_scale[0] / int8_range}, false);
     this->engine_->AddTensorScale(input_name, in_scale / int8_range);
   } else {
     auto *weight1 = pblock_from_tensor<TargetT, PrecisionT>(
......
@@ -72,7 +72,8 @@ void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()(
   if (enable_int8) {
     const float int8_range = 127.;
     float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
+    auto weight_scale =
+        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
     PBlock<TargetT> *weight1 =
         new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
     this->engine_->RegistBlock(weight1);
@@ -93,8 +94,8 @@ void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()(
     weight1->d_tensor().copy_from(weight1->h_tensor());
     this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
     this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(op_name,
-                                            {weight_scale / int8_range}, false);
+    this->engine_->Graph()->SetWeightsScale(
+        op_name, {weight_scale[0] / int8_range}, false);
     this->engine_->AddTensorScale(input_name, in_scale / int8_range);
   } else {
     auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
......
@@ -76,7 +76,8 @@ void FcBaseOpConverter<TargetT, PrecisionT>::operator()(
     ::anakin::saber::Shape anakin_shape(weight_shape);
     const float int8_range = 127.;
     float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
-    float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
+    auto weight_scale =
+        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
     PBlock<TargetT> *weight1 =
         new PBlock<TargetT>(anakin_shape, ::anakin::AK_INT8);
     this->engine_->RegistBlock(weight1);
@@ -95,8 +96,8 @@ void FcBaseOpConverter<TargetT, PrecisionT>::operator()(
     weight1->d_tensor().copy_from(weight1->h_tensor());
     this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
     this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
-    this->engine_->Graph()->SetWeightsScale(op_name,
-                                            {weight_scale / int8_range}, false);
+    this->engine_->Graph()->SetWeightsScale(
+        op_name, {weight_scale[0] / int8_range}, false);
     this->engine_->AddTensorScale(input_name, in_scale / int8_range);
   } else {
     auto *weight1 = pblock_from_vector<TargetT, PrecisionT>(trans_weight_data,
......
@@ -164,6 +164,7 @@ struct Argument {
                       AnalysisConfig::Precision);
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
 
   DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
                       anakin_max_shape_t);
......
@@ -87,7 +87,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       bool enable_int8 = argument->tensorrt_precision_mode() ==
                          AnalysisConfig::Precision::kInt8;
+      bool use_calib_mode = argument->tensorrt_use_calib_mode();
       pass->Set("enable_int8", new bool(enable_int8));
+      pass->Set("use_calib_mode", new bool(use_calib_mode));
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
......
@@ -61,7 +61,7 @@ void RenameAndGetOutputs(
     std::set<std::string> *output_names,
     std::unordered_map<std::string, std::string> *output_name_map,
     const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
-    bool is_trt) {
+    bool trt_and_not_int8) {
   //// In the normal case, the paddle-trt exists bug when runing the googlenet.
   // When there are more than two convolutions of 1 * 1 with the same input, the
   // paddle-tensorrt will do the merging optimization, which fuse those conv
@@ -121,7 +121,7 @@ void RenameAndGetOutputs(
     for (auto out_var : correspond_node->outputs) {
       var2id[out_var->Name()] = out_var->id();
     }
-    if (op_desc.Type() == "conv2d" && is_trt) {
+    if (op_desc.Type() == "conv2d" && trt_and_not_int8) {
       auto input_var_name = op_desc.Input("Input").front();
       auto filter_var_name = op_desc.Input("Filter").front();
       auto out_var_name = op_desc.Output("Output").front();
......
@@ -43,7 +43,7 @@ void RenameAndGetOutputs(
     std::set<std::string> *output_names,
     std::unordered_map<std::string, std::string> *output_name_map,
     const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
-    bool is_trt = true);
+    bool trt_and_not_int8 = false);
 
 }  // namespace analysis
 }  // namespace inference
......
@@ -149,6 +149,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       graph_var_map[node->Name()] = node;
     }
   }
+  auto enable_int8 = Get<bool>("enable_int8");
+  auto use_calib_mode = Get<bool>("use_calib_mode");
   auto &subgraph_nodes = *Agent(node).subgraph();
 
   // The following procedure is used to rename all the intermediate
@@ -165,7 +167,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // it is either an OP's input or an OP's output.
   RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
                       &output_names_with_id, &output_names, &output_name_map,
-                      graph_var_map);
+                      graph_var_map, !enable_int8);
 
   // When tensorrt engine runs at the end of the operation,
   // output_mapping help us copy the data from the renamed ITensor
@@ -196,7 +198,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
   SetAttr(op_desc->Proto(), "parameters", params);
 
-  auto enable_int8 = Get<bool>("enable_int8");
   auto use_static_engine = Get<bool>("use_static_engine");
   auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
                                       std::to_string(0));
@@ -204,13 +205,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // Get "" when there is no cached calibration table data.
   bool load_from_memory = Get<bool>("model_from_memory");
   std::string calibration_data = "";
-  if (enable_int8) {
+  if (enable_int8 && use_calib_mode) {
     calibration_data = GetTrtCalibTableData(
         Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
   }
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
   std::string trt_engine_serialized_data = "";
   SetAttr(op_desc->Proto(), "engine_serialized_data",
@@ -222,7 +224,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   }
   // When in int8 mode and calibration_mode, the program just produce the
   // calibration table data.
-  bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
+  bool calibration_mode =
+      (enable_int8 && calibration_data.size() == 0 && use_calib_mode);
   if (calibration_mode) {
     // calibraion mode means generate int8 calibration table data process.
     return;
......
@@ -21,6 +21,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
+extern const std::vector<std::string> kTRTSubgraphPasses;
 extern const std::vector<std::string> kAnakinSubgraphPasses;
 
 PassStrategy *AnalysisConfig::pass_builder() const {
@@ -105,6 +106,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
   CP_MEMBER(trt_use_static_engine_);
+  CP_MEMBER(trt_use_calib_mode_);
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
@@ -177,7 +179,8 @@ std::shared_ptr<MkldnnQuantizerConfig> AnalysisConfig::mkldnn_quantizer_config()
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode, bool use_static) {
+    AnalysisConfig::Precision precision_mode, bool use_static,
+    bool use_calib_mode) {
 #ifdef PADDLE_WITH_CUDA
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
@@ -190,6 +193,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   tensorrt_min_subgraph_size_ = min_subgraph_size;
   tensorrt_precision_mode_ = precision_mode;
   trt_use_static_engine_ = use_static;
+  trt_use_calib_mode_ = use_calib_mode;
 
   Update();
 #else
@@ -228,13 +232,10 @@ void AnalysisConfig::Update() {
   }
 
   if (use_tensorrt_) {
-    const auto &passes = pass_builder_->AllPasses();
-    if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
-        std::end(passes)) {
-      // Append after the Affine_channel_conv_fuse pass.
-      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+    pass_builder()->ClearPasses();
+    for (const auto &pass : kTRTSubgraphPasses) {
+      pass_builder()->AppendPass(pass);
     }
-    pass_builder()->DeletePass("runtime_context_cache_pass");
   }
 
   if (use_mkldnn_) {
......
@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
     argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
+    argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
   }
 
   if (config_.anakin_engine_enabled()) {
......
@@ -142,7 +142,8 @@ struct AnalysisConfig {
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3,
                             Precision precision = Precision::kFloat32,
-                            bool use_static = false);
+                            bool use_static = false,
+                            bool use_calib_mode = false);
   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
@@ -266,6 +267,7 @@ struct AnalysisConfig {
   int tensorrt_min_subgraph_size_{3};
   Precision tensorrt_precision_mode_;
   bool trt_use_static_engine_;
+  bool trt_use_calib_mode_;
 
   // memory reuse related.
   bool enable_memory_optim_{false};
......
@@ -70,6 +70,24 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 
+const std::vector<std::string> kTRTSubgraphPasses({
+  "infer_clean_graph_pass",                    //
+  "conv_affine_channel_fuse_pass",             //
+  "conv_eltwiseadd_affine_channel_fuse_pass",  //
+  "quant_conv2d_dequant_fuse_pass",            //
+  "delete_quant_dequant_op_pass",              //
+  // "fc_fuse_pass",                           //
+  "tensorrt_subgraph_pass",                    //
+  "conv_bn_fuse_pass",                         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+  "conv_elementwise_add_act_fuse_pass",   //
+  "conv_elementwise_add2_act_fuse_pass",  //
+  "conv_elementwise_add_fuse_pass",       //
+#endif                                    //
+  "transpose_flatten_concat_fuse_pass",
+});
+
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",  //
......
@@ -142,6 +142,7 @@ class GpuPassStrategy : public PassStrategy {
   virtual ~GpuPassStrategy() = default;
 };
 
+extern const std::vector<std::string> kTRTSubgraphPasses;
 extern const std::vector<std::string> kAnakinSubgraphPasses;
 
 }  // namespace paddle
@@ -43,12 +43,13 @@ class ActivationOpConverter : public OpConverter {
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         op_pair->second);
     auto output_name = op_desc.Output("Out")[0];
-    layer->setName((op_type_ + " (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {  // the test framework can not determine which is the
-                      // output, so place the declaration inside.
-      engine_->DeclareOutput(output_name);
+
+    RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
+    if (op_desc.HasAttr("out_scale")) {
+#if IS_TRT_VERSION_GE(5000)
+      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
+      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
+#endif
     }
   }
......
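RreplenishLayerAndOutput, used here and in the converters below, is a helper on the OpConverter base class whose definition is not part of this diff. Judging from the per-converter lines it replaces, it bundles the old name/output bookkeeping; the following is a hedged sketch only (engine_ is assumed to be the converter's TensorRTEngine member, and the real signature in op_converter.h may differ):

// Sketch, not the actual definition from this commit.
void RreplenishLayerAndOutput(nvinfer1::ILayer* layer,
                              const std::string& layer_type,
                              const std::vector<std::string>& output_names,
                              bool test_mode) {
  for (size_t i = 0; i < output_names.size(); ++i) {
    layer->getOutput(i)->setName(output_names[i].c_str());
    engine_->SetITensor(output_names[i], layer->getOutput(i));
    if (test_mode) {
      // The test framework cannot tell which tensor is the output,
      // so declare it here, just as the removed per-converter code did.
      engine_->DeclareOutput(output_names[i]);
    }
  }
  layer->setName((layer_type + " (Output: " + output_names[0] + ")").c_str());
}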
@@ -116,18 +116,12 @@ class BatchNormOpConverter : public OpConverter {
                                      scale_weights.get(), power_weights.get());
 
     auto output_name = op_desc.Output("Y").front();
-    layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Bias").front()] =
         std::move(combile_bias_tensor);
     engine_->weight_map[op_desc.Input("Scale").front()] =
         std::move(combile_scale_tensor);
-    engine_->SetITensor(output_name, layer->getOutput(0));
-
-    if (test_mode) {
-      engine_->DeclareOutput(output_name);
-    }
+
+    RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
   }
 };
......
@@ -42,13 +42,7 @@ class ConcatOpConverter : public OpConverter {
     axis = axis - 1;  // Remove batch dim
     layer->setAxis(axis);
     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("concat (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {  // the test framework can not determine which is the
-                      // output, so place the declaration inside.
-      engine_->DeclareOutput(output_name);
-    }
+    RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode);
   }
 };
......
@@ -32,25 +32,31 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   PADDLE_ENFORCE(engine != nullptr);
   auto* X = engine->GetITensor(op_desc.Input("Input").front());
 
-  // Declare weights
   auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
   PADDLE_ENFORCE_NOT_NULL(Y_v);
   auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+  float* weight_data = nullptr;
+  bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
+
+  if (enable_int8) {
+#if IS_TRT_VERSION_GE(5000)
+    float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
+    auto weight_scale =
+        boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
+    weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t,
+                                           true, weight_scale);
+    engine->SetTensorDynamicRange(X, in_scale);
+#endif
+  } else {
+    weight_data =
+        engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false);
+  }
 
-  platform::CPUPlace cpu_place;
-  std::unique_ptr<framework::LoDTensor> weight_tensor(
-      new framework::LoDTensor());
-  weight_tensor->Resize(Y_t->dims());
-  TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
-
-  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
-
-  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-  const int n_output = weight_tensor->dims()[0];
-  const int n_input = weight_tensor->dims()[1];
-  const int filter_h = weight_tensor->dims()[2];
-  const int filter_w = weight_tensor->dims()[3];
+  PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
+  const int n_output = Y_t->dims()[0];
+  const int n_input = Y_t->dims()[1];
+  const int filter_h = Y_t->dims()[2];
+  const int filter_w = Y_t->dims()[3];
   const int groups = boost::get<int>(op_desc.GetAttr("groups"));
   const std::vector<int> dilations =
       boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
@@ -66,7 +72,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
-                                static_cast<size_t>(weight_tensor->numel())};
+                                static_cast<size_t>(Y_t->numel())};
 
   TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
   auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
@@ -80,11 +86,16 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   auto output_name = op_desc.Output("Output").front();
   layer->setName((name + " (Output: " + output_name + ")").c_str());
-  engine->weight_map[op_desc.Input("Filter").front()] =
-      std::move(weight_tensor);
   layer->getOutput(0)->setName(output_name.c_str());
   engine->SetITensor(output_name, layer->getOutput(0));
+
+#if IS_TRT_VERSION_GE(5000)
+  if (enable_int8) {
+    float output_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
+    engine->SetTensorDynamicRange(layer->getOutput(0), output_scale);
+  }
+#endif
+
   if (test_mode) {
     engine->DeclareOutput(output_name);
   }
......
@@ -55,11 +55,8 @@ class DropoutOpConverter : public OpConverter {
     engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] =
         std::move(weight_tensor);
     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("dropout (Output: " + output_name + ")").c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {
-      engine_->DeclareOutput(output_name);
-    }
+
+    RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
   }
 };
......
@@ -55,17 +55,13 @@ class ElementwiseWeightOpConverter : public OpConverter {
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    float* weight_data = nullptr;
+    weight_data =
+        engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false);
 
-    platform::CPUPlace cpu_place;
-    std::unique_ptr<framework::LoDTensor> weight_tensor(
-        new framework::LoDTensor());
-    weight_tensor->Resize(Y_t->dims());
-    TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
-
-    auto* weight_data =
-        weight_tensor->mutable_data<float>(platform::CPUPlace());
     auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-    std::vector<int> dims_y = framework::vectorize2int(weight_tensor->dims());
+    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
     if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
       if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
     }
@@ -92,9 +88,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
       PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
     }
 
-    TensorRTEngine::Weight shift_weights{
-        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
-        weight_tensor->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
+                                         static_cast<void*>(weight_data),
+                                         static_cast<size_t>(Y_t->numel())};
     TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                          0};
     TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
@@ -112,14 +108,13 @@ class ElementwiseWeightOpConverter : public OpConverter {
     }
 
     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(
-        ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {  // the test framework can not determine which is the
-                      // output, so place the declaration inside.
-      engine_->DeclareOutput(output_name);
+
+    RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
+                             test_mode);
+    if (op_desc.HasAttr("out_scale")) {
+#if IS_TRT_VERSION_GE(5000)
+      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
+      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
+#endif
     }
   }
@@ -138,6 +133,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
+    nvinfer1::ILayer* layer = nullptr;
 
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -153,13 +149,11 @@ class ElementwiseTensorOpConverter : public OpConverter {
     if (CheckDims(dims_x, dims_y)) {
       // The two input tensor should have the same dims
       VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+      nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER(
           engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
           *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
-      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
-      layer->getOutput(0)->setName(output_name.c_str());
-      engine_->SetITensor(output_name, layer->getOutput(0));
+
+      layer = elet_layer;
     } else {
       VLOG(3) << "Convert a fluid elementwise op to TensorRT "
                  "ElementWisePluginLayer";
@@ -168,17 +162,18 @@ class ElementwiseTensorOpConverter : public OpConverter {
           new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
       plugin->AddInput(X);
       plugin->AddInput(Y);
-      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
+      nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin(
          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
-      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
-      layer->getOutput(0)->setName(output_name.c_str());
-      engine_->SetITensor(output_name, layer->getOutput(0));
+
+      layer = plugin_layer;
     }
-    if (test_mode) {  // the test framework can not determine which is the
-                      // output, so place the declaration inside.
-      engine_->DeclareOutput(output_name);
+
+    RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
+    if (op_desc.HasAttr("out_scale")) {
+#if IS_TRT_VERSION_GE(5000)
+      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
+      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
+#endif
     }
   }
......
@@ -53,33 +53,47 @@ class FcOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
     framework::OpDesc op_desc(op, nullptr);
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto input_names = op_desc.InputNames();
+    bool with_bias = input_names.size() >= 3;
+    std::string w_name = "Y";
+    std::string i_name = "X";
+    if (with_bias) {
+      w_name = "W";
+      i_name = "Input";
+    }
 
     // Declare inputs
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
 
     // Declare weights
-    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+
     // This may trigger a GPU->CPU copy, because TRT's weight can only be
     // assigned from CPU memory, that can't be avoided.
-    platform::CPUPlace cpu_place;
-    framework::LoDTensor weight_tensor;
-    weight_tensor.Resize(Y_t->dims());
-    TensorCopySync((*Y_t), cpu_place, &weight_tensor);
-
-    auto* weight_data = weight_tensor.mutable_data<float>(platform::CPUPlace());
-
-    PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL);  // a matrix
-    size_t n_output = weight_tensor.dims()[1];
+    float* weight_data = nullptr;
+    bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
+    if (enable_int8) {
+#if IS_TRT_VERSION_GE(5000)
+      float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
+      auto weight_scale =
+          boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
+      weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
+                                              Y_t, true, weight_scale);
+      engine_->SetTensorDynamicRange(X, in_scale);
+#endif
+    } else {
+      weight_data =
+          engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false);
+    }
+
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
 
     std::unique_ptr<framework::Tensor> tmp(new framework::LoDTensor());
-    tmp->Resize(weight_tensor.dims());
+    tmp->Resize(Y_t->dims());
 
     memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data,
            Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
@@ -100,19 +114,32 @@ class FcOpConverter : public OpConverter {
     // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
     // handle `mul`, leave `add` as another layer.
     // DEBUG
-    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    float* bias_data = nullptr;
+    int bias_num = 0;
+    if (with_bias) {
+      auto* b_v = scope.FindVar(op_desc.Input("Bias").front());
+      auto* b_t = b_v->GetMutable<framework::LoDTensor>();
+      bias_data =
+          engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false);
+      bias_num = b_t->numel();
+    }
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
+                                static_cast<void*>(bias_data),
+                                static_cast<size_t>(bias_num)};
 
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
                                        *const_cast<nvinfer1::ITensor*>(X),
                                        n_output, tmp_weight.get(), bias.get());
 
+    engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp);
     auto output_name = op_desc.Output("Out").front();
-    layer->setName(("fc (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
-    engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
-    if (test_mode) {
-      engine_->DeclareOutput(output_name);
+
+    RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);
+    if (enable_int8) {
+#if IS_TRT_VERSION_GE(5000)
+      float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
+      engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
+#endif
     }
   }
 };
......
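The int8 branch of the fc converter assumes the preceding quant/dequant fuse pass has already attached per-op scale attributes. A rough sketch of the op description it expects (attribute names follow the ones read above; the numeric values are made up for illustration):

#include <vector>
#include "paddle/fluid/framework/op_desc.h"

framework::OpDesc fc_desc;
fc_desc.SetType("fc");
fc_desc.SetInput("Input", {"x"});       // i_name when with_bias is true
fc_desc.SetInput("W", {"fc_w"});        // w_name when with_bias is true
fc_desc.SetInput("Bias", {"fc_b"});
fc_desc.SetOutput("Out", {"fc_out"});
fc_desc.SetAttr("enable_int8", true);
fc_desc.SetAttr("input_scale", 5.7f);   // range of the activation feeding the input
fc_desc.SetAttr("weight_scale", std::vector<float>{1.3f});  // size 1 for fc
fc_desc.SetAttr("out_scale", 8.2f);     // range of the fc output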
...@@ -76,15 +76,9 @@ class LeakyReluOpConverter : public OpConverter { ...@@ -76,15 +76,9 @@ class LeakyReluOpConverter : public OpConverter {
engine_->weight_map.end()); engine_->weight_map.end());
engine_->weight_map[alpha_name] = std::move(alpha_tensor); engine_->weight_map[alpha_name] = std::move(alpha_tensor);
std::string layer_name = "leaky_relu (Output: ";
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
output_layer->getOutput(0)->setName(output_name.c_str()); RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
engine_->SetITensor(output_name, output_layer->getOutput(0)); test_mode);
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
output_layer->setName((layer_name + ")").c_str());
} }
}; };
......
...@@ -172,6 +172,21 @@ class OpConverter { ...@@ -172,6 +172,21 @@ class OpConverter {
engine->FreezeNetwork(); engine->FreezeNetwork();
} }
void RreplenishLayerAndOutput(
nvinfer1::ILayer* layer, const std::string& layer_type,
const std::vector<std::string>& output_tensor_names,
bool test_mode = false) {
size_t num_out = output_tensor_names.size();
for (size_t i = 0; i < num_out; i++) {
layer->getOutput(i)->setName(output_tensor_names[i].c_str());
engine_->SetITensor(output_tensor_names[i], layer->getOutput(i));
if (test_mode) {
engine_->DeclareOutput(output_tensor_names[i]);
}
}
layer->setName(
(layer_type + " (Output: " + output_tensor_names[0] + ")").c_str());
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; } void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {} virtual ~OpConverter() {}
......
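RreplenishLayerAndOutput centralizes the setName / SetITensor / DeclareOutput boilerplate that each converter previously repeated. A typical call site then reduces to something like the following sketch (mirroring the activation converters; the layer-construction line is illustrative only):

auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input,
                                   nvinfer1::ActivationType::kRELU);
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "relu", {output_name}, test_mode);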
...@@ -51,13 +51,7 @@ class PadOpConverter : public OpConverter { ...@@ -51,13 +51,7 @@ class PadOpConverter : public OpConverter {
PADDLE_ENFORCE(layer != nullptr); PADDLE_ENFORCE(layer != nullptr);
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0)); RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode);
layer->setName(("scale (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
} }
}; };
......
...@@ -148,11 +148,13 @@ class Pool2dOpConverter : public OpConverter { ...@@ -148,11 +148,13 @@ class Pool2dOpConverter : public OpConverter {
} }
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("pool2d (Output: " + output_name + ")").c_str()); RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); if (op_desc.HasAttr("out_scale")) {
if (test_mode) { #if IS_TRT_VERSION_GE(5000)
engine_->DeclareOutput(output_name); float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
#endif
} }
} }
}; };
......
...@@ -58,15 +58,8 @@ class PReluOpConverter : public OpConverter { ...@@ -58,15 +58,8 @@ class PReluOpConverter : public OpConverter {
engine_->weight_map[op_desc.Input("Alpha")[0]] = engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_temp); std::move(alpha_tensor_temp);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str()); RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode);
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
} }
}; };
......
...@@ -34,9 +34,13 @@ class SoftMaxOpConverter : public OpConverter { ...@@ -34,9 +34,13 @@ class SoftMaxOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(input1)); *const_cast<nvinfer1::ITensor*>(input1));
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0)); RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode);
if (test_mode) {
engine_->DeclareOutput(output_name); if (op_desc.HasAttr("out_scale")) {
#if IS_TRT_VERSION_GE(5000)
float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
#endif
} }
} }
}; };
......
...@@ -40,8 +40,7 @@ namespace tensorrt { ...@@ -40,8 +40,7 @@ namespace tensorrt {
* Get a random float value between [low, high] * Get a random float value between [low, high]
*/ */
float random(float low, float high) { float random(float low, float high) {
static std::random_device rd; static std::mt19937 mt(100);
static std::mt19937 mt(rd());
std::uniform_real_distribution<double> dist(low, high); std::uniform_real_distribution<double> dist(low, high);
return dist(mt); return dist(mt);
} }
......
...@@ -53,10 +53,40 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -53,10 +53,40 @@ void TensorRTEngine::FreezeNetwork() {
infer_builder_->setMaxWorkspaceSize(max_workspace_); infer_builder_->setMaxWorkspaceSize(max_workspace_);
if (enable_int8_) { if (enable_int8_) {
infer_builder_->setInt8Mode(true); infer_builder_->setInt8Mode(true);
PADDLE_ENFORCE( if (calibrator_) {
calibrator_ != nullptr, infer_builder_->setInt8Calibrator(calibrator_);
"The precision mode is 'INT8', the calibrator should not be nullptr"); } else {
infer_builder_->setInt8Calibrator(calibrator_); infer_builder_->setInt8Calibrator(nullptr);
#if IS_TRT_VERSION_GE(5000)
infer_builder_->setStrictTypeConstraints(true);
for (auto &quant_range : quant_dynamic_range_) {
auto tensor = quant_range.first;
float range = quant_range.second;
tensor->setDynamicRange(-range, range);
}
std::unordered_set<nvinfer1::ITensor *> all_t;
for (int i = 0; i < infer_network_->getNbLayers(); i++) {
auto layer = infer_network_->getLayer(i);
for (int j = 0; j < layer->getNbOutputs(); j++) {
all_t.insert(layer->getOutput(j));
}
}
for (int i = 0; i < infer_network_->getNbInputs(); i++) {
all_t.insert(infer_network_->getInput(i));
}
for (auto &t : all_t) {
if (!quant_dynamic_range_.count(t)) {
LOG(WARNING)
<< "We are in trt int8 mode(not calibration), scale not setted"
<< " for tensor " << t->getName()
<< ", this might be ok when trt does not need this range";
}
}
#endif
}
} }
infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
...@@ -133,6 +163,47 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { ...@@ -133,6 +163,47 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size; runtime_batch_ = batch_size;
} }
float *TensorRTEngine::GetWeightCPUData(const std::string &name,
framework::Tensor *weight_tensor,
bool enable_int8,
const std::vector<float> &scale) {
auto w_dims = weight_tensor->dims();
platform::CPUPlace cpu_place;
PADDLE_ENFORCE(!weight_map.count(name),
"During TRT Op converter: We set weight %s with the same name "
"twice into the weight_map",
name);
weight_map[name].reset(new framework::Tensor());
weight_map[name]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
if (enable_int8) {
// when the op is fc, scale's size should be 1
// when the op is conv, the scale's size should be w_dims[0]
bool valid_scale_size =
(scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
for (int i = 0; i < weight_tensor->numel(); i++) {
bool is_valid_int8 =
((weight_data[i] >= -128) && (weight_data[i] <= 127));
PADDLE_ENFORCE(is_valid_int8,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]");
if (scale.size() == 1) {
weight_data[i] *= (scale[0] / 127);
} else {
PADDLE_ENFORCE(w_dims.size() == 4,
"TRT int8 quant : We only use the channel quant for "
"conv op, so the weight dims should be 4.");
int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
weight_data[i] *= (scale[i / inner_size] / 127);
}
}
}
return weight_data;
}
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
......
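GetWeightCPUData assumes the quant-aware training pass left weights as integer-valued floats in [-128, 127]; multiplying by scale / 127 restores the float weights, while the recorded scales later feed the tensor dynamic ranges. A standalone sketch of the per-channel case for a conv weight (DequantizeConvWeight is illustrative, not part of this patch):

#include <cstddef>
#include <vector>

// Conv weight of shape [out_c, in_c, kh, kw]; scale has one entry per output
// channel and inner_size = in_c * kh * kw, matching the loop above.
void DequantizeConvWeight(std::vector<float>* w, const std::vector<float>& scale,
                          int inner_size) {
  for (std::size_t i = 0; i < w->size(); ++i) {
    // channel index = i / inner_size, as in GetWeightCPUData
    (*w)[i] *= scale[i / inner_size] / 127.0f;
  }
}
// e.g. with scale[c] = 2.54f, a stored value of 100.0f becomes 2.0f.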
...@@ -18,8 +18,10 @@ limitations under the License. */ ...@@ -18,8 +18,10 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...@@ -131,6 +133,13 @@ class TensorRTEngine { ...@@ -131,6 +133,13 @@ class TensorRTEngine {
int GetDeviceId() { return device_id_; } int GetDeviceId() { return device_id_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, plugin::PluginTensorRT*); int num_inputs, plugin::PluginTensorRT*);
void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
quant_dynamic_range_[tensor] = range;
}
float* GetWeightCPUData(const std::string& name,
framework::Tensor* weight_tensor, bool enable_int8,
const std::vector<float>& scale = {});
// A pointer to CPU memory is needed of the TRT weight. // A pointer to CPU memory is needed of the TRT weight.
// Before TRT runs, fluid loads weight into GPU storage. // Before TRT runs, fluid loads weight into GPU storage.
...@@ -184,8 +193,13 @@ class TensorRTEngine { ...@@ -184,8 +193,13 @@ class TensorRTEngine {
infer_ptr<nvinfer1::ICudaEngine> infer_engine_; infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
infer_ptr<nvinfer1::IExecutionContext> infer_context_; infer_ptr<nvinfer1::IExecutionContext> infer_context_;
infer_ptr<nvinfer1::IHostMemory> ihost_memory_; infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
}; // class TensorRTEngine }; // class TensorRTEngine
#define IS_TRT_VERSION_GE(version) \
((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
// Add an layer__ into engine__ with args ARGS. // Add an layer__ into engine__ with args ARGS.
// For example: // For example:
// //
......
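The IS_TRT_VERSION_GE macro packs the four TensorRT version components into one integer, so a worked example of the check that guards the int8 code paths (for a hypothetical TensorRT 5.1.2.2 build) is:

// NV_TENSORRT_MAJOR = 5, NV_TENSORRT_MINOR = 1,
// NV_TENSORRT_PATCH = 2, NV_TENSORRT_BUILD = 2
//   5 * 1000 + 1 * 100 + 2 * 10 + 2 = 5122
// IS_TRT_VERSION_GE(5000)  ->  (5122 >= 5000)  ->  true,
// so setDynamicRange / setStrictTypeConstraints are compiled in only on TRT >= 5.0.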
...@@ -32,7 +32,7 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -32,7 +32,7 @@ struct SimpleOpTypeSetTeller : public Teller {
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu", "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"}}; "conv2d_transpose", "leaky_relu", "fc"}};
}; };
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
......
...@@ -48,6 +48,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -48,6 +48,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
int workspace_size_; int workspace_size_;
std::unique_ptr<TRTInt8Calibrator> calibrator_; std::unique_ptr<TRTInt8Calibrator> calibrator_;
bool enable_int8_; bool enable_int8_;
bool use_calib_mode_;
std::string calibration_data_; std::string calibration_data_;
std::string engine_key_; std::string engine_key_;
std::string engine_serialized_data_; std::string engine_serialized_data_;
...@@ -65,6 +66,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -65,6 +66,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
workspace_size_ = Attr<int>("workspace_size"); workspace_size_ = Attr<int>("workspace_size");
device_id_ = Attr<int>("gpu_id"); device_id_ = Attr<int>("gpu_id");
enable_int8_ = Attr<bool>("enable_int8"); enable_int8_ = Attr<bool>("enable_int8");
use_calib_mode_ = Attr<bool>("use_calib_mode");
calibration_data_ = Attr<std::string>("calibration_data"); calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key"); engine_key_ = Attr<std::string>("engine_key");
engine_serialized_data_ = Attr<std::string>("engine_serialized_data"); engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
...@@ -75,7 +77,8 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -75,7 +77,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
} }
// calibration_mode is true represents we need to // calibration_mode is true represents we need to
// generate the calibration table data. // generate the calibration table data.
calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); calibration_mode_ =
(enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_);
VLOG(4) << "calibration_mode: " << calibration_mode_; VLOG(4) << "calibration_mode: " << calibration_mode_;
if (enable_int8_ && calibration_data_.size()) { if (enable_int8_ && calibration_data_.size()) {
......
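With use_calib_mode_ added to the condition, enable_int8 alone no longer forces calibration. A sketch of how the flags appear to combine after this change (inferred from the condition above and the converter changes, not an exhaustive table):

// enable_int8_  calibration_data_  use_calib_mode_   resulting path
// false         -                  -                 fp32 / fp16 engine
// true          empty              true              calibration_mode_ == true:
//                                                    run fp32 and collect a calibration table
// true          non-empty          true              build the int8 engine from the stored table
// true          empty              false             build the int8 engine directly from the
//                                                    out_scale / input_scale dynamic ranges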
...@@ -104,6 +104,7 @@ TEST(TensorRTEngineOp, manual) { ...@@ -104,6 +104,7 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetAttr("engine_key", std::string("a_engine")); engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false)); engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z0"})); std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
...@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("engine_key", std::string("b_engine")); engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false)); engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z3"})); std::vector<std::string>({"z3"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
......
...@@ -229,7 +229,7 @@ void BindAnalysisConfig(py::module *m) { ...@@ -229,7 +229,7 @@ void BindAnalysisConfig(py::module *m) {
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3, py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("use_static") = true) py::arg("use_static") = true, py::arg("use_calib_mode") = false)
.def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine, .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
py::arg("max_batch_size") = 1, py::arg("max_batch_size") = 1,
py::arg("max_input_shape") = py::arg("max_input_shape") =
......
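From C++, the same int8-without-calibration path can be requested through AnalysisConfig; a minimal sketch, assuming the C++ EnableTensorRTEngine parameters mirror the pybind arguments shown above (workspace_size, max_batch_size, min_subgraph_size, precision_mode, use_static, use_calib_mode):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureTrtInt8(paddle::AnalysisConfig* config) {
  config->EnableUseGpu(100 /*MB*/, 0 /*gpu id*/);
  config->EnableTensorRTEngine(1 << 20 /*workspace_size*/, 1 /*max_batch_size*/,
                               3 /*min_subgraph_size*/,
                               paddle::AnalysisConfig::Precision::kInt8,
                               false /*use_static*/, false /*use_calib_mode*/);
}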