diff --git a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc index 4f61b0301081e9e3acb4aa180ebb974cff543ec0..3ff91e0bcb76c13457e47a7cde3066afbdf23baf 100644 --- a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc +++ b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc @@ -56,6 +56,22 @@ namespace paddle { namespace framework { namespace ir { +bool HasScale(OpDesc* const op_ptr, + std::string* name, + std::string regexp = "Input_scale_") { + name->clear(); + std::unordered_map<std::string, Attribute> attr_map = op_ptr->GetAttrMap(); + std::unordered_map<std::string, Attribute>::iterator iter; + int len = regexp.size(); + for (iter = attr_map.begin(); iter != attr_map.end(); iter++) { + if (regexp == iter->first.substr(0, len)) { + *name = iter->first; + return true; + } + } + return false; +} + void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; const std::string pattern_name = "vit_attention_fuse"; @@ -103,6 +119,16 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { float alpha = PADDLE_GET_CONST(float, scale1_op->Op()->GetAttr("scale")); desc.SetAttr("alpha", alpha); + // int8 for fc + std::string scale_name; + if (HasScale(matmul0_op->Op(), &scale_name)) { + desc.SetAttr("Input_scale", matmul0_op->Op()->GetAttr(scale_name)); + } + if (HasScale(elementwise0_op->Op(), &scale_name, "Out")) { + desc.SetAttr("fc_out_threshold", + elementwise0_op->Op()->GetAttr(scale_name)); + } + // Create a new node for the fused op. 
auto vit_attention_node = graph->CreateOpNode(&desc); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index f997c8bd1f864bf20982cf51cbcb4287948863e1..0515cb513d007daa61f63c63c7ce69a314d32491 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -398,13 +398,37 @@ class MultiheadMatMulOpConverter : public OpConverter { // add fc layer nvinfer1::ILayer* fc_layer = nullptr; - fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, - FullyConnected, - *reshape_before_fc_layer->getOutput(0), - n, - weight, - bias); + if (op_desc.HasAttr("Input_scale")) { + engine_->SetTensorDynamicRange( + reshape_before_fc_layer->getOutput(0), in_scale); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + Convolution, + *reshape_before_fc_layer->getOutput(0), + n, + nv_ksize, + weight, + bias); + PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), + true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers " + "in int8 mode")); + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + FullyConnected, + *reshape_before_fc_layer->getOutput(0), + n, + weight, + bias); + } + fc_layer->setName( + ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); // add shuffle for CustomQKVToContextPluginDynamic layer auto* reshape_after_fc_layer = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 6889f88fa4c95ff37011d16990feb69473625d54..fa1cb51e7f9693fccdf298012b20d082b5bf7222 100644 --- 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -818,7 +818,11 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest): "Y": ["matmul1_weight"], }, "op_outputs": {"Out": ["matmul1_output"]}, - "op_attrs": {"trans_x": False, "trans_y": False}, + "op_attrs": { + "trans_x": False, + "trans_y": False, + "Input_scale_layer": 1.0, + }, }, { "op_type": "elementwise_add", @@ -832,6 +836,7 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest): "Scale_x": 1.0, "Scale_y": 1.0, "axis": 2, + "Out": 1.0, }, }, { @@ -1035,6 +1040,11 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.workspace_size = 2013265920 + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num(), ( + 1e-3, + 1e-3, + ) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num(), ( 1e-3,