Unverified commit 65f70525, authored by Zhaolong Xing, committed by GitHub

TRT int8: refine trt int8 for dynamic range set (#21112)

* refine trt int8 for dynamic range set
test=develop

* refine trt int8
test=develop
Parent 56b5d147
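Context for the diff below: when TensorRT runs in int8 mode without a calibration table, every tensor consumed by an int8 layer needs an explicit dynamic range, which this patch threads from the scales recorded by quant-aware training through the graph passes into the TensorRT converters. A minimal standalone sketch of the TensorRT 5.x API involved (builder/network setup omitted; the single-input assumption is for illustration only):

```cpp
#include <NvInfer.h>

// Sketch: enable int8 without a calibrator and hand TRT an explicit
// per-tensor dynamic range. A tensor with quantization scale s is assumed
// to cover the symmetric interval [-s, s].
void EnableManualInt8(nvinfer1::IBuilder* builder,
                      nvinfer1::INetworkDefinition* network, float scale) {
  builder->setInt8Mode(true);
  builder->setInt8Calibrator(nullptr);  // no calibration table
  nvinfer1::ITensor* input = network->getInput(0);
  input->setDynamicRange(-scale, scale);
}
```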
......@@ -39,6 +39,7 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(),
pattern_name);
pattern();
auto* scope = param_scope();
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
......@@ -47,10 +48,29 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
std::string any_op_out_name = any_op_out->Var()->Name();
std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name();
std::string input_scale_var_name =
quant_dequant_op->Op()->Input("InScale").front();
const LoDTensor& input_scale_tensor =
scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
const float* input_scale_data = input_scale_tensor.data<float>();
float input_scale = input_scale_data[0];
auto* any_op2_desc = any_op2->Op();
// auto input_args_names = any_op2_desc->InputArgumentNames();
auto var_map = any_op2_desc->Inputs();
std::string arg_name = "";
for (auto& name_m : var_map) {
if (std::find(name_m.second.begin(), name_m.second.end(),
quant_dequant_op_out_name) != name_m.second.end()) {
arg_name = name_m.first;
}
}
CHECK(arg_name.size() > 0) << "can not find the input "
<< quant_dequant_op_out_name;
any_op2_desc->SetAttr("enable_int8", true);
any_op2_desc->SetAttr(arg_name + "_scale", input_scale);
// modify the any_op2's inputs
for (auto& name_m : var_map) {
if (std::find(name_m.second.begin(), name_m.second.end(),
quant_dequant_op_out_name) != name_m.second.end()) {
......@@ -65,6 +85,7 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
any_op2_desc->Flush();
}
}
any_op2_desc->Flush();
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph,
{quant_dequant_op, quant_dequant_op_out,
......
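The net effect of the pass change above, sketched with hypothetical node names: the fake quant/dequant op disappears from the graph, and the scale it carried in its InScale input survives as an attribute on the consuming op.

```cpp
// Before: prev_op -> out -> fake_quant_dequant(InScale = s) -> qd_out -> next_op(X: qd_out)
// After:  prev_op -> out -> next_op(X: out)
//         with next_op attrs: enable_int8 = true, X_scale = s
//         (the attribute is named "<consumer-input-arg>_scale")
```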
......@@ -99,7 +99,7 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
auto* mul_op_desc = mul->Op();
if (mul_op_desc->HasAttr("enable_int8")) {
desc.SetAttr("enable_int8", mul_op_desc->GetAttr("enable_int8"));
desc.SetAttr("input_scale", mul_op_desc->GetAttr("input_scale"));
desc.SetAttr("Input_scale", mul_op_desc->GetAttr("X_scale"));
desc.SetAttr("weight_scale", mul_op_desc->GetAttr("weight_scale"));
if (mul_op_desc->HasAttr("out_scale"))
desc.SetAttr("out_scale", mul_op_desc->GetAttr("out_scale"));
......
......@@ -140,22 +140,24 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
framework::OpDesc new_op_desc(base_op_desc, nullptr);
new_op_desc.SetType(quantized_op_type);
new_op_desc.SetAttr("enable_int8", true);
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion" ||
quantized_op_type == "depthwise_conv2d") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetAttr("Input_scale", input_scale);
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetAttr("Input_scale", input_scale);
new_op_desc.SetOutput("Out", {new_output});
} else if (quantized_op_type == "mul") {
new_op_desc.SetInput("X", {new_input});
new_op_desc.SetAttr("X_scale", input_scale);
new_op_desc.SetOutput("Out", {new_output});
}
new_op_desc.SetAttr("enable_int8", true);
new_op_desc.SetAttr("input_scale", input_scale);
new_op_desc.SetAttr("weight_scale", weight_scale);
new_op_desc.Flush();
auto* new_op = graph->CreateOpNode(&new_op_desc);
......
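The scale attribute thus follows the quantized op's input slot: conv2d, conv2d_fusion, depthwise_conv2d, and fc use `Input_scale`, mul uses `X_scale`, and the weight scale is always `weight_scale`. A hypothetical helper (not part of the patch) capturing the convention:

```cpp
#include <string>

// Hypothetical: maps a quantized op type to the name of its input-scale attr.
std::string InputScaleAttrName(const std::string& quantized_op_type) {
  return quantized_op_type == "mul" ? "X_scale" : "Input_scale";
}
```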
......@@ -76,7 +76,8 @@ const std::vector<std::string> kTRTSubgraphPasses({
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_quant_dequant_op_pass", //
// "fc_fuse_pass", //
"conv_bn_fuse_pass", //
"fc_fuse_pass", //
"tensorrt_subgraph_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
......
......@@ -40,7 +40,8 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
CHECK(op_desc.HasAttr("Input_scale"));
float in_scale = boost::get<float>(op_desc.GetAttr("Input_scale"));
auto weight_scale =
boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t,
......@@ -89,13 +90,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
#if IS_TRT_VERSION_GE(5000)
if (enable_int8) {
float output_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine->SetTensorDynamicRange(layer->getOutput(0), output_scale);
}
#endif
if (test_mode) {
engine->DeclareOutput(output_name);
}
......
......@@ -110,10 +110,11 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
test_mode);
if (op_desc.HasAttr("out_scale")) {
if (op_desc.HasAttr("enable_int8")) {
#if IS_TRT_VERSION_GE(5000)
float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
CHECK(op_desc.HasAttr("X_scale"));
float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
engine_->SetTensorDynamicRange(X, x_scale);
#endif
}
}
......@@ -169,10 +170,14 @@ class ElementwiseTensorOpConverter : public OpConverter {
layer = plugin_layer;
}
RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
if (op_desc.HasAttr("out_scale")) {
if (op_desc.HasAttr("enable_int8")) {
#if IS_TRT_VERSION_GE(5000)
float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
CHECK(op_desc.HasAttr("X_scale"));
CHECK(op_desc.HasAttr("Y_scale"));
float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
float y_scale = boost::get<float>(op_desc.GetAttr("Y_scale"));
engine_->SetTensorDynamicRange(X, x_scale);
engine_->SetTensorDynamicRange(Y, y_scale);
#endif
}
}
......
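In both elementwise paths the dynamic range is now attached to the layer inputs via `X_scale`/`Y_scale` instead of stamping the output with `out_scale`. A sketch of what `SetTensorDynamicRange` is assumed to do on the engine side: record the scale now, and let `FreezeNetwork()` apply it as a symmetric range later (the map name matches the `quant_dynamic_range_` member seen in the engine hunk further down):

```cpp
#include <unordered_map>
#include <NvInfer.h>

std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;

// Record the scale for a tensor; applied when the network is frozen.
void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
  quant_dynamic_range_[tensor] = range;
}

// The FreezeNetwork() step, sketched: a scale s becomes the range [-s, s].
void ApplyRecordedRanges() {
  for (auto& kv : quant_dynamic_range_) {
    kv.first->setDynamicRange(-kv.second, kv.second);
  }
}
```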
......@@ -77,7 +77,8 @@ class FcOpConverter : public OpConverter {
bool enable_int8 = op_desc.HasAttr("enable_int8");
if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
CHECK(op_desc.HasAttr(i_name + "_scale"));
float in_scale = boost::get<float>(op_desc.GetAttr(i_name + "_scale"));
auto weight_scale =
boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale"));
weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
......@@ -135,12 +136,6 @@ class FcOpConverter : public OpConverter {
auto output_name = op_desc.Output("Out").front();
RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);
if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
#endif
}
}
};
......
......@@ -42,6 +42,13 @@ class LeakyReluOpConverter : public OpConverter {
engine_, Activation, *input, nvinfer1::ActivationType::kLEAKY_RELU);
layer->setAlpha(alpha);
output_layer = layer;
bool enable_int8 = op_desc.HasAttr("enable_int8");
if (enable_int8) {
CHECK(op_desc.HasAttr("X_scale"));
float in_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
engine_->SetTensorDynamicRange(input, in_scale);
}
#else
platform::CPUPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor(
......
......@@ -160,10 +160,11 @@ class Pool2dOpConverter : public OpConverter {
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
if (op_desc.HasAttr("out_scale")) {
if (op_desc.HasAttr("enable_int8")) {
#if IS_TRT_VERSION_GE(5000)
float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
CHECK(op_desc.HasAttr("X_scale"));
float input_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
engine_->SetTensorDynamicRange(input1, input_scale);
#endif
}
}
......
......@@ -104,12 +104,31 @@ void TensorRTEngine::FreezeNetwork() {
for (auto &t : all_t) {
if (!quant_dynamic_range_.count(t)) {
LOG(WARNING)
VLOG(3)
<< "We are in trt int8 mode(not calibration), scale not setted"
<< " for tensor " << t->getName()
<< ", this might be ok when trt does not need this range";
}
}
std::unordered_set<std::string> all_out_t_name;
for (int i = 0; i < infer_network_->getNbOutputs(); i++) {
auto *temp = infer_network_->getOutput(i);
temp->setDynamicRange(-1, 1);
all_out_t_name.insert(temp->getName());
}
for (int i = 0; i < infer_network_->getNbLayers(); i++) {
auto layer = infer_network_->getLayer(i);
for (int j = 0; j < layer->getNbOutputs(); j++) {
auto *temp_out = layer->getOutput(j);
if (std::find(all_out_t_name.begin(), all_out_t_name.end(),
temp_out->getName()) != all_out_t_name.end()) {
layer->setPrecision(nvinfer1::DataType::kFLOAT);
layer->setOutputType(j, nvinfer1::DataType::kFLOAT);
}
}
}
#endif
}
}
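The new `FreezeNetwork()` block keeps int8 tensors inside the engine: every network output gets a placeholder range of [-1, 1] (some range is mandatory in int8 mode), and any layer that produces a network output is pinned to FP32, so the engine always hands float tensors back to the framework. A standalone sketch; the strict-type-constraints builder flag is an assumption, not shown in this hunk:

```cpp
#include <string>
#include <unordered_set>
#include <NvInfer.h>

// Pin all network outputs to FP32 in int8 mode.
void ForceFp32Outputs(nvinfer1::IBuilder* builder,
                      nvinfer1::INetworkDefinition* network) {
  builder->setStrictTypeConstraints(true);  // assumed: makes setOutputType binding
  std::unordered_set<std::string> out_names;
  for (int i = 0; i < network->getNbOutputs(); ++i) {
    nvinfer1::ITensor* out = network->getOutput(i);
    out->setDynamicRange(-1.0f, 1.0f);  // placeholder range for the output tensor
    out_names.insert(out->getName());
  }
  for (int i = 0; i < network->getNbLayers(); ++i) {
    nvinfer1::ILayer* layer = network->getLayer(i);
    for (int j = 0; j < layer->getNbOutputs(); ++j) {
      if (out_names.count(layer->getOutput(j)->getName())) {
        layer->setPrecision(nvinfer1::DataType::kFLOAT);
        layer->setOutputType(j, nvinfer1::DataType::kFLOAT);
      }
    }
  }
}
```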
......@@ -214,11 +233,6 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
(scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
for (int i = 0; i < weight_tensor->numel(); i++) {
bool is_valid_int8 =
((weight_data[i] >= -128) && (weight_data[i] <= 127));
PADDLE_ENFORCE(is_valid_int8,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]");
if (scale.size() == 1) {
weight_data[i] *= (scale[0] / 127);
} else {
......
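For reference, the loop above inverts the quantized weight representation: a stored value in int8 range maps back to float as `w * scale / 127` when a single per-tensor scale is given (the elided `else` branch presumably applies per-output-channel scales). As a one-line sketch:

```cpp
// Dequantize one stored weight: w_q is in int8 range, scale is max |w|.
inline float DequantWeight(float w_q, float scale) {
  return w_q * scale / 127.0f;
}
```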
......@@ -56,6 +56,10 @@ struct SimpleOpTypeSetTeller : public Teller {
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
// Do not support ops that are labeled `skip_quant`.
if (desc.HasAttr("op_namescope") &&
boost::get<std::string>(desc.GetAttr("op_namescope")) == "/skip_quant_2/")
return false;
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
......
......@@ -303,6 +303,14 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz")
endif()
inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
endif()
set(CAPI_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/capi_tests_models")
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle {
namespace inference {
TEST(quant_int8, resnet50) {
std::string model_dir = FLAGS_infer_model;
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.SwitchUseFeedFetchOps(false);
config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8,
false, false);
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();
int channels = 1;
int height = 3;
int width = 3;
int input_num = channels * height * width * 1;
float *input = new float[input_num];
memset(input, 0, input_num * sizeof(float));
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({1, channels, height, width});
input_t->copy_from_cpu(input);
ASSERT_TRUE(predictor->ZeroCopyRun());
}
} // namespace inference
} // namespace paddle
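Note the `EnableTensorRtEngine` call in this test: the last two booleans turn off engine serialization and TRT calibration, so the int8 ranges must come from the scales this patch attaches to the ops. An annotated restatement (parameter names follow the `AnalysisConfig` signature as I understand it on this branch):

```cpp
config.EnableTensorRtEngine(1 << 30,  // workspace_size in bytes
                            1,        // max_batch_size
                            1,        // min_subgraph_size
                            AnalysisConfig::Precision::kInt8,
                            false,    // use_static: no engine serialization
                            false);   // use_calib_mode: no calibration table
```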