From 6da043ebc664cc1ed04efb4159c4b35b90f41a27 Mon Sep 17 00:00:00 2001
From: ceci3
Date: Fri, 16 Apr 2021 17:44:37 +0800
Subject: [PATCH] support ernie trt-int8 for inference (#32232)

* support ernie trt-int8 for inference

* fix reshape
---
 .../embedding_eltwise_layernorm_fuse_pass.cc  |  5 ++
 .../ir/multihead_matmul_fuse_pass.cc          | 32 ++++++++++
 .../framework/ir/skip_layernorm_fuse_pass.cc  |  4 ++
 .../tensorrt/convert/emb_eltwise_layernorm.cc |  6 +-
 .../fluid/inference/tensorrt/convert/fc_op.cc | 63 +++++++++++++++----
 .../tensorrt/convert/multihead_matmul_op.cc   | 49 +++++++++++++--
 .../tensorrt/convert/skip_layernorm.cc        |  5 ++
 .../inference/tensorrt/convert/slice_op.cc    |  6 ++
 .../inference/tensorrt/convert/stack_op.cc    |  5 ++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  6 ++
 .../slim/quantization/quantization_pass.py    |  5 +-
 11 files changed, 167 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 44069f61d9..59d071e103 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
       new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()});
       new_op_desc.SetAttr("epsilon",
                           end_patter_layernorms[k]->Op()->GetAttr("epsilon"));
+
+      if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) {
+        new_op_desc.SetAttr("enable_int8", true);
+      }
+
       auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc);
       for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) {

diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index e8f4dbd295..1e8349e878 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     multihead_op_desc.SetAttr("alpha", scale_attr);
     multihead_op_desc.SetAttr("head_number", head_number);
 
+    auto* mul0_op_desc = mul0->Op();
+    auto* mul1_op_desc = mul1->Op();
+    auto* mul2_op_desc = mul2->Op();
+    if (mul0_op_desc->HasAttr("enable_int8")) {
+      multihead_op_desc.SetAttr("enable_int8",
+                                mul0_op_desc->GetAttr("enable_int8"));
+      // all mul ops have the same input
+      multihead_op_desc.SetAttr("Input_scale",
+                                mul0_op_desc->GetAttr("X_scale"));
+      auto weight_scale0 = BOOST_GET_CONST(
+          std::vector<float>, mul0_op_desc->GetAttr("weight_scale"));
+      auto weight_scale1 = BOOST_GET_CONST(
+          std::vector<float>, mul1_op_desc->GetAttr("weight_scale"));
+      auto weight_scale2 = BOOST_GET_CONST(
+          std::vector<float>, mul2_op_desc->GetAttr("weight_scale"));
+      auto weight_max = std::max(weight_scale0, weight_scale1);
+      weight_max = std::max(weight_max, weight_scale2);
+      multihead_op_desc.SetAttr("weight_scale", weight_max);
+
+      if (mul0_op_desc->HasAttr("out_threshold")) {
+        auto out_scale0 =
+            BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold"));
+        auto out_scale1 =
+            BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold"));
+        auto out_scale2 =
+            BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold"));
+        auto out_scale_max = std::max(out_scale0, out_scale1);
+        out_scale_max = std::max(out_scale_max, out_scale2);
+        multihead_op_desc.SetAttr("out_threshold", out_scale_max);
+      }
+    }
+
     auto* multihead = graph->CreateOpNode(&multihead_op_desc);
 
     IR_NODE_LINK_TO(input0, multihead);

diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index 0e63320f2f..232e1d8da4 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   new_desc.SetInput("Scale", {layer_norm_scale->Name()});
   new_desc.SetInput("Bias", {layer_norm_bias->Name()});
 
+  if (elementwise->Op()->HasAttr("out_threshold")) {
+    new_desc.SetAttr("enable_int8", true);
+  }
+
   // outputs
   new_desc.SetOutput("Out", {layer_norm_out->Name()});
 
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 957dfe0369..f13f172454 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
 #if IS_TRT_VERSION_GE(6000)
-    VLOG(4) << "convert fluid swish op to tensorrt layer";
+    VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";
     framework::OpDesc op_desc(op, nullptr);
     auto id_names = op_desc.Input("Ids");
 
@@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     int64_t bias_size = framework::product(bias_dims);
     int64_t scale_size = framework::product(scale_dims);
     nvinfer1::ILayer* layer = nullptr;
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
 
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
        int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ?
                                           1 : 0);
+        if (enable_int8) {
+          output_fp16 = 1;
+        }
         PADDLE_ENFORCE_EQ(
             output_fp16, 1,
             platform::errors::InvalidArgument(

diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 527d0ee208..194d76c737 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter {
     auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                          TensorRTEngine::Weight& weight,
                          TensorRTEngine::Weight& bias) {
-      auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
-                                            n_output, weight.get(), bias.get());
+      nvinfer1::ILayer* fc_layer = nullptr;
+      if (enable_int8) {
+        PADDLE_ENFORCE_EQ(
+            op_desc.HasAttr("out_threshold"), true,
+            platform::errors::InvalidArgument(
+                "must have out threshold in fc layers in int8 mode"));
+        float out_scale =
+            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        nvinfer1::DimsHW nv_ksize(1, 1);
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
+                                        nv_ksize, weight.get(), bias.get());
+        engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+      } else {
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
+                                        n_output, weight.get(), bias.get());
+      }
       auto output_name = op_desc.Output("Out").front();
       if (activation_type == "relu") {
@@ -229,13 +243,24 @@
                 "dims equals to 4, the last dim of input must be 1, but got %d",
                 input_d[3]));
       }
-      for (int i = 0; i < 3; i++) {
-        if (i < input_dims) {
-          reshape_dim3[i] = input_d[i];
-        } else {
-          reshape_dim3[i] = 1;
+      if (enable_int8) {
+        reshape_dim3[0] = 1;
+        for (int i = 0; i < 3; i++) {
+          reshape_dim3[0] *= input_d[i];
+          if (i > 0) {
+            reshape_dim3[i] = 1;
+          }
+        }
+      } else {
+        for (int i = 0; i < 3; i++) {
+          if (i < input_dims) {
+            reshape_dim3[i] = input_d[i];
+          } else {
+            reshape_dim3[i] = 1;
+          }
         }
       }
+
       nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1],
                                   reshape_dim3[2]);
       auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
@@ -249,11 +274,25 @@
           platform::errors::InvalidArgument(
               "Invalid dimensions. When x_num_col_dims equals to "
               "2, input_dims should not be 1"));
-      for (int i = 0; i < 4; i++) {
-        if (i < input_dims) {
-          reshape_dim4[i] = input_d[i];
-        } else {
-          reshape_dim4[i] = 1;
+
+      if (enable_int8) {
+        for (int i = 0; i < 4; i++) {
+          if (i == 0) {
+            reshape_dim4[i] = input_d[i];
+          } else {
+            reshape_dim4[i] = 1;
+            if (i < input_dims) {
+              reshape_dim4[1] *= input_d[i];
+            }
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          if (i < input_dims) {
+            reshape_dim4[i] = input_d[i];
+          } else {
+            reshape_dim4[i] = 1;
+          }
         }
       }
       nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1],

diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 2008646549..f2f45c694a 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter {
         auto* bias_v = scope.FindVar(bias_name);
         auto* bias_t = bias_v->GetMutable<framework::LoDTensor>();
 
-        float* weight_data =
-            engine_->GetWeightCPUData(weight_name, weight_t, false);
+        float* weight_data = nullptr;
+        bool enable_int8 = op_desc.HasAttr("enable_int8");
+        float in_scale = 0.;
+
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("Input_scale"), true,
+              platform::errors::InvalidArgument(
+                  "must have input scale in multihead layers in int8 mode"));
+          in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
+          auto weight_scale =
+              BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("weight_scale"));
+          weight_data =
+              engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale);
+          engine_->SetTensorDynamicRange(input, in_scale);
+        } else {
+          weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false);
+        }
+
         float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false);
         std::vector<float> weight_data_tmp;
         weight_data_tmp.reserve(weight_t->numel());
@@ -117,8 +134,27 @@
                                      static_cast<void*>(bias_data),
                                      static_cast<size_t>(bias_t->numel())};
 
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight, bias);
+        nvinfer1::ILayer* fc_layer = nullptr;
+        float dp_probs = 1.0 / 127.0;
+        if (enable_int8) {
+          nvinfer1::DimsHW nv_ksize(1, 1);
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n,
+                                          nv_ksize, weight, bias);
+        } else {
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
+                                          weight, bias);
+        }
+
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("out_threshold"), true,
+              platform::errors::InvalidArgument(
+                  "must have out threshold in multihead layers in int8 mode"));
+          float out_scale =
+              BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+          dp_probs = out_scale / 127.0;
+        }
 
         auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");
@@ -128,6 +164,9 @@
        int type = static_cast<int>((engine_->WithFp16() == 1) ?
                                   nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT);
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }
        bool has_mask = true;
        int var_seqlen = 1;
        const std::vector<nvinfer1::PluginField> fields{
            {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
@@ -136,7 +175,7 @@
            {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
            {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
            {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1},
-       };
+           { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }};
        nvinfer1::PluginFieldCollection* plugin_collection =
            static_cast<nvinfer1::PluginFieldCollection*>(
                malloc(sizeof(*plugin_collection) +

diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
index 3db7709acc..b44bdcef71 100644
--- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
@@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter {
     auto* scale = get_persistable_data("Scale", &scale_dims);
     int bias_size = framework::product(bias_dims);
     int scale_size = framework::product(scale_dims);
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
 
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
@@ -62,6 +63,10 @@
         int ld = input1->getDimensions().d[2];  // hidden dimension
         assert(ld > 0);
 
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }
+
         const std::vector<nvinfer1::PluginField> fields{
             {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
             {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1},

diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
index 38521d2564..aee39b7cf0 100644
--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter {
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
 
+    if (op_desc.HasAttr("out_threshold")) {
+      float out_scale =
+          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      engine_->SetTensorDynamicRange(input, out_scale);
+    }
+
     std::vector<int> axes =
         BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("axes"));
     std::vector<int> starts =

diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
index d538c58879..a0292b2112 100644
--- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
@@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter {
     for (int i = 0; i < input_num; ++i) {
       inputs[i] = engine_->GetITensor(input[i]);
+      if (op_desc.HasAttr("out_threshold")) {
+        float out_scale =
+            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        engine_->SetTensorDynamicRange(inputs[i], out_scale);
+      }
     }
 
     int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));

diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 179f534ace..53225b7978 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
 #if IS_TRT_VERSION_GE(7130)
     teller_set.insert("group_norm");
+    int8_teller_set.insert("multihead_matmul");
+    int8_teller_set.insert("skip_layernorm");
+    int8_teller_set.insert("fused_embedding_eltwise_layernorm");
+    int8_teller_set.insert("matmul");
+    int8_teller_set.insert("stack");
+    int8_teller_set.insert("slice");
 #endif
   }

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 79aad8c8bc..ec215a3e57 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -60,6 +60,7 @@ _out_scale_op_list = [
     "swish",
     "softmax",
     "batch_norm",
+    "layer_norm",
     "elementwise_add",
     "pool2d",
     "reshape2",
@@ -67,6 +68,7 @@ _out_scale_op_list = [
     "concat",
     "elementwise_mul",
     "scale",
+    "slice",
     "hard_swish",
     "hard_sigmoid",
     "conv2d_transpose",
@@ -119,6 +121,7 @@ _op_real_in_out_name = {
     "swish": [["X"], ["Out"]],
     "dropout": [["X"], ["Out"]],
     "batch_norm": [["X"], ["Y"]],
+    "layer_norm": [["X"], ["Y"]],
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],
@@ -1749,7 +1752,7 @@ class AddQuantDequantPass(object):
         "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
         "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
         "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2",
-        "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm"
+        "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm"
     ]
 
     # To be compatible with PaddleSlim, not remove _activation_type for now
-- 
GitLab
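Usage note (editor's addition, not part of the patch): the int8 path above is driven entirely by attributes that the quantization passes attach to the graph (enable_int8, Input_scale, weight_scale, out_threshold), so serving a quantize-aware-trained ERNIE model only requires switching the TensorRT subgraph engine to int8 in the inference config. The sketch below shows one plausible way to do that with the public C++ inference API; the model directory, input tensor name, and shape ranges are placeholders, and EnableTensorRtOSS() assumes a build with the TensorRT OSS varlen plugins (TRT >= 7.1.3, matching the IS_TRT_VERSION_GE(7130) guard in op_teller.cc).

// Minimal sketch: running a quantized ERNIE model through the TRT int8 path.
// All names and shape ranges below are illustrative placeholders.
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./ernie_quant_model");  // placeholder: QAT-exported model dir
  config.EnableUseGpu(100 /* initial GPU memory (MB) */, 0 /* device id */);

  // Run the fused ops (multihead_matmul, skip_layernorm, slice, stack, ...)
  // in int8; the out_threshold attrs written by the QAT pass supply scales,
  // so no offline calibration pass is needed (use_calib_mode = false).
  config.EnableTensorRtEngine(1 << 30 /* workspace bytes */,
                              1 /* max batch */, 5 /* min subgraph size */,
                              paddle_infer::PrecisionType::kInt8,
                              false /* use_static */,
                              false /* use_calib_mode */);
  config.EnableTensorRtOSS();  // varlen plugins from the TensorRT OSS repo

  // ERNIE inputs are variable-length, so dynamic shape ranges are required.
  std::map<std::string, std::vector<int>> min_shape{{"input_ids", {1, 1}}};
  std::map<std::string, std::vector<int>> max_shape{{"input_ids", {10, 128}}};
  std::map<std::string, std::vector<int>> opt_shape{{"input_ids", {1, 64}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... copy inputs into predictor->GetInputHandle(...), then predictor->Run().
  return 0;
}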