diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index caac973d8b89a3ff1c605d81cb07bbdcb7a63304..7e4da57e9e7dfce3051d42183a8e89ebd04bd8f0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1470,6 +1470,8 @@ USE_TRT_CONVERTER(conv3d_transpose); USE_TRT_CONVERTER(mish); USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) +USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) +USE_TRT_CONVERTER(preln_skip_layernorm) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 017caca6adc814af32d6045ce0510099c5935ed8..e91faedb06872a5abe38c1de77b54477e0da8ef4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -21,6 +21,8 @@ nv_library(tensorrt_converter nearest_interp_v2_op.cc pool3d_op.cc deformable_conv_op.cc + preln_emb_eltwise_layernorm.cc + preln_skip_layernorm.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..50f90de85fd0494110b86dde743428a6b1844b57 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts a fused_preln_embedding_eltwise_layernorm op into the TensorRT OSS
+// plugin "CustomEmbLayerNormPluginDynamic" (version "3").
+//
+// The conversion requires TensorRT >= 7.0, engine_->use_oss(),
+// engine_->with_interleaved() and an "enable_int8" attribute on the op.
+// The op outputs "Out_0" and "Out_1" are the two plugin outputs, each passed
+// through a shuffle layer that applies the {2, 1, 0, 3} permutation.
+//
+// NOTE(review): template arguments had been stripped from the patch text
+// (e.g. "std::vector id_names"); they are restored here from usage and should
+// be confirmed against the upstream sources.
+class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+#if IS_TRT_VERSION_GE(7000)
+    VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer";
+
+    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "PrelnErnie: If you want to use oss, must be with interleaved"));
+    }
+    framework::OpDesc op_desc(op, nullptr);
+    // Only the presence of the attribute is tested, not its value.
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
+    if (!enable_int8) {
+      PADDLE_THROW(
+          platform::errors::Fatal("use with_interleaved must be int8."));
+    }
+    auto word_id_name = op_desc.Input("WordId").front();
+    auto pos_id_name = op_desc.Input("PosId").front();
+    // NOTE(review): the std::string is heap-allocated and handed to
+    // engine_->Set(); presumably the engine takes ownership - confirm.
+    engine_->Set("ernie_pos_name", new std::string(pos_id_name));
+
+    auto sent_id_name = op_desc.Input("SentId").front();
+    auto word_emb_name = op_desc.Input("WordEmbedding").front();
+    auto pos_emb_name = op_desc.Input("PosEmbedding").front();
+    auto sent_emb_name = op_desc.Input("SentEmbedding").front();
+
+    // Layer names below are derived from "Out_0": this converter only ever
+    // reads the outputs "Out_0" and "Out_1", so "Out" is not relied upon.
+    const std::string out_0_name = op_desc.Output("Out_0")[0];
+    const std::string out_1_name = op_desc.Output("Out_1")[0];
+
+    const std::vector<std::string> id_names{word_id_name, pos_id_name,
+                                            sent_id_name};
+    const std::vector<std::string> emb_names{word_emb_name, pos_emb_name,
+                                             sent_emb_name};
+    const int input_num = static_cast<int>(id_names.size());
+
+    // Fetches a persistable variable from the scope: stores its dims in
+    // *dims and returns the pointer given by engine_->GetWeightCPUData().
+    auto get_persistable_data = [&](const std::string& var_name,
+                                    framework::DDim* dims) -> float* {
+      auto* temp_var = scope.FindVar(var_name);
+      // Fail with a readable error instead of dereferencing a null pointer.
+      PADDLE_ENFORCE_NE(
+          temp_var, nullptr,
+          platform::errors::InvalidArgument(
+              "Variable %s is not found in the scope.", var_name));
+      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
+      (*dims) = temp_tensor->dims();
+      return engine_->GetWeightCPUData(var_name, temp_tensor, false);
+    };
+
+    // input_embs[0]: word_embedding
+    // input_embs[1]: pos_embedding
+    // input_embs[2]: sent_embedding
+    std::vector<float*> input_embs;
+    std::vector<int64_t> emb_sizes;
+    for (const auto& emb_name : emb_names) {
+      framework::DDim emb_dims;
+      float* emb_data = get_persistable_data(emb_name, &emb_dims);
+      PADDLE_ENFORCE_EQ(
+          emb_dims.size(), 2,
+          platform::errors::InvalidArgument(
+              "The fused PrelnEmbEltwiseLayerNorm's emb should be 2 dims."));
+      input_embs.push_back(emb_data);
+      emb_sizes.push_back(framework::product(emb_dims));
+    }
+
+    framework::DDim bias_dims, scale_dims;
+
+    auto* bias =
+        get_persistable_data(op_desc.Input("Bias").front(), &bias_dims);
+    auto* scale =
+        get_persistable_data(op_desc.Input("Scale").front(), &scale_dims);
+    int64_t bias_size = framework::product(bias_dims);
+    int64_t scale_size = framework::product(scale_dims);
+    int output_int8 = 1;
+
+    PADDLE_ENFORCE_EQ(
+        input_num, 3,
+        platform::errors::InvalidArgument(
+            "When using oss and var-len, embedding_eltwise_layernorm op "
+            "should have 3 inputs only, but got %d.",
+            input_num));
+    const std::vector<nvinfer1::PluginField> fields{
+        {"bert_embeddings_layernorm_beta", bias,
+         nvinfer1::PluginFieldType::kFLOAT32, static_cast<int32_t>(bias_size)},
+        {"bert_embeddings_layernorm_gamma", scale,
+         nvinfer1::PluginFieldType::kFLOAT32, static_cast<int32_t>(scale_size)},
+        {"bert_embeddings_word_embeddings", input_embs[0],
+         nvinfer1::PluginFieldType::kFLOAT32,
+         static_cast<int32_t>(emb_sizes[0])},
+        {"bert_embeddings_token_type_embeddings", input_embs[2],
+         nvinfer1::PluginFieldType::kFLOAT32,
+         static_cast<int32_t>(emb_sizes[2])},
+        {"bert_embeddings_position_embeddings", input_embs[1],
+         nvinfer1::PluginFieldType::kFLOAT32,
+         static_cast<int32_t>(emb_sizes[1])},
+        {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1},
+    };
+
+    // The collection lives on the stack (it used to be malloc()ed), so
+    // nothing needs freeing and nothing leaks if a later call throws.
+    nvinfer1::PluginFieldCollection plugin_fc;
+    plugin_fc.nbFields = static_cast<int>(fields.size());
+    plugin_fc.fields = fields.data();
+
+    std::vector<nvinfer1::ITensor*> plugin_inputs;
+    plugin_inputs.emplace_back(
+        engine_->GetITensor(word_id_name));  // word_embedding,
+                                             // eval_placeholder_0
+    plugin_inputs.emplace_back(
+        engine_->GetITensor(sent_id_name));  // sent_embedding,
+                                             // eval_placeholder_1
+    plugin_inputs.emplace_back(
+        engine_->GetITensor(pos_id_name));  // cu_seqlens,
+                                            // eval_placeholder_2
+    // The fourth network input is used as max_seqlen; make sure it exists
+    // before asking for its name.
+    auto* max_seqlen_input = engine_->network()->getInput(3);
+    PADDLE_ENFORCE_NE(
+        max_seqlen_input, nullptr,
+        platform::errors::InvalidArgument(
+            "PrelnEmbEltwiseLayerNorm expects a 4th network input "
+            "(max_seqlen), but the network does not have one."));
+    auto max_seqlen_tensor = engine_->GetITensor(max_seqlen_input->getName());
+    auto* shuffle_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
+    nvinfer1::Dims shape_dim;
+    shape_dim.nbDims = 1;
+    shape_dim.d[0] = -1;
+    shuffle_layer->setReshapeDimensions(shape_dim);
+    shuffle_layer->setName(
+        ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " +
+         out_0_name + ")")
+            .c_str());
+    engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+    plugin_inputs.emplace_back(
+        shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+
+    auto creator = GetPluginRegistry()->getPluginCreator(
+        "CustomEmbLayerNormPluginDynamic", "3");
+    // Guard against a missing creator before dereferencing it.
+    PADDLE_ENFORCE_NE(
+        creator, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to get creator of CustomEmbLayerNormPluginDynamic"));
+
+    auto plugin_obj =
+        creator->createPlugin("CustomEmbLayerNormPluginDynamic", &plugin_fc);
+    auto plugin_layer = engine_->network()->addPluginV2(
+        plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
+    PADDLE_ENFORCE_NE(
+        plugin_layer, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to add CustomEmbLayerNormPluginDynamic layer"));
+    plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " +
+                           out_0_name + ")")
+                              .c_str());
+    float out_0_scale =
+        BOOST_GET_CONST(float, op_desc.GetAttr("out_0_threshold"));
+    float out_1_scale =
+        BOOST_GET_CONST(float, op_desc.GetAttr("out_1_threshold"));
+    engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_0_scale);
+    engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_1_scale);
+
+    auto* shuffler_embed_out0 =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(0)));
+    nvinfer1::Permutation transpose_0{2, 1, 0, 3};
+    shuffler_embed_out0->setSecondTranspose(transpose_0);
+    shuffler_embed_out0->getOutput(0)->setName(out_0_name.c_str());
+    engine_->SetITensor(out_0_name, shuffler_embed_out0->getOutput(0));
+    shuffler_embed_out0->setName(
+        ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_0: " +
+         out_0_name + ")")
+            .c_str());
+
+    auto* shuffler_embed_out1 =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(1)));
+    nvinfer1::Permutation transpose_1{2, 1, 0, 3};
+    shuffler_embed_out1->setSecondTranspose(transpose_1);
+    shuffler_embed_out1->getOutput(0)->setName(out_1_name.c_str());
+    engine_->SetITensor(out_1_name, shuffler_embed_out1->getOutput(0));
+    shuffler_embed_out1->setName(
+        ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_1: " +
+         out_1_name + ")")
+            .c_str());
+
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "PrelnErnie: to use oss with interleaved, "
+        "your TRT version must be no less than 7.0"));
+#endif
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(fused_preln_embedding_eltwise_layernorm,
+                          PrelnEmbEltwiseLayerNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
new file mode 100644
index
0000000000000000000000000000000000000000..aa0d6fbe81376ed61992dbc6c15c69145aa98a4d
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts a preln_skip_layernorm op into the TensorRT OSS plugin
+// "CustomSkipLayerNormPluginDynamic" (version "4").
+//
+// The conversion requires TensorRT >= 7.0, engine_->use_oss(),
+// engine_->with_interleaved() and an "enable_int8" attribute on the op.
+// Inputs "X" and "Y" are fed to the plugin; "Bias" and "Scale" are passed as
+// the plugin fields "beta" and "gamma".
+//
+// NOTE(review): template arguments had been stripped from the patch text
+// (e.g. "std::vector inputs"); they are restored here from usage and should
+// be confirmed against the upstream sources.
+class PrelnSkipLayerNormOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+#if IS_TRT_VERSION_GE(7000)
+    VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer";
+    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "PrelnErnie: If you want to use oss, must be with interleaved"));
+    }
+    framework::OpDesc op_desc(op, nullptr);
+    // Only the presence of the attribute is tested, not its value.
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
+    if (!enable_int8) {
+      PADDLE_THROW(
+          platform::errors::Fatal("use with_interleaved must be int8."));
+    }
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+    std::vector<nvinfer1::ITensor*> inputs{input1, input2};
+
+    // Fetches the persistable variable bound to op input `arg_name`: stores
+    // its dims in *dims and returns the pointer given by
+    // engine_->GetWeightCPUData().
+    auto get_persistable_data = [&](const std::string& arg_name,
+                                    framework::DDim* dims) -> float* {
+      std::string var_name = op_desc.Input(arg_name).front();
+      auto* temp_var = scope.FindVar(var_name);
+      // Fail with a readable error instead of dereferencing a null pointer.
+      PADDLE_ENFORCE_NE(
+          temp_var, nullptr,
+          platform::errors::InvalidArgument(
+              "Variable %s is not found in the scope.", var_name));
+      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
+      (*dims) = temp_tensor->dims();
+      return engine_->GetWeightCPUData(var_name, temp_tensor, false);
+    };
+
+    framework::DDim bias_dims, scale_dims;
+    auto* bias = get_persistable_data("Bias", &bias_dims);
+    auto* scale = get_persistable_data("Scale", &scale_dims);
+    int bias_size = static_cast<int>(framework::product(bias_dims));
+    int scale_size = static_cast<int>(framework::product(scale_dims));
+
+    VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved";
+
+    auto creator = GetPluginRegistry()->getPluginCreator(
+        "CustomSkipLayerNormPluginDynamic", "4");
+    PADDLE_ENFORCE_NE(
+        creator, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to get creator of CustomPrelnSkipLayerNormPluginDynamic"));
+    const std::vector<nvinfer1::PluginField> fields{
+        {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size},
+        {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}};
+
+    // The collection lives on the stack. It used to be malloc()ed and never
+    // freed, which leaked one allocation per converted op.
+    nvinfer1::PluginFieldCollection plugin_fc;
+    plugin_fc.nbFields = static_cast<int>(fields.size());
+    plugin_fc.fields = fields.data();
+
+    auto pluginObj =
+        creator->createPlugin("CustomSkipLayerNormPluginDynamic", &plugin_fc);
+    auto plugin_layer = engine_->network()->addPluginV2(
+        inputs.data(), inputs.size(), *pluginObj);
+
+    PADDLE_ENFORCE_NE(
+        plugin_layer, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to add CustomPrelnSkipLayerNormPluginDynamic layer"));
+
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(plugin_layer, "preln_skip_layernorm",
+                             {output_name}, test_mode);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "PrelnErnie: to use oss with interleaved, "
+        "your TRT version must be no less than 7.0"));
+#endif
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(preln_skip_layernorm, PrelnSkipLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index f9fc8dcb4891c99b35c59c960e97643e73105f9c..799c6c55bb121778cfe3b1a39f2dc1af315236dd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -30,24 +30,6 @@ namespace tensorrt { // Just tell by the op_types. struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { -#if IS_TRT_VERSION_GE(5130) - teller_set.insert("relu6"); - teller_set.insert("hard_sigmoid"); - teller_set.insert("clip"); - int8_teller_set.insert("relu6"); - int8_teller_set.insert("hard_sigmoid"); - int8_teller_set.insert("clip"); -#endif -#if IS_TRT_VERSION_GE(6000) - teller_set.insert("fused_embedding_eltwise_layernorm"); - teller_set.insert("multihead_matmul"); - teller_set.insert("skip_layernorm"); - teller_set.insert("slice"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("multihead_matmul"); - int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("slice"); -#endif // TODO(baoachun) The group_norm trt plugin will check input's dim // not -1 failed when dynamic shape mode. // #if IS_TRT_VERSION_GE(7130) @@ -76,104 +58,124 @@ struct SimpleOpTypeSetTeller : public Teller { private: // use this set for no calib int8. 
- std::unordered_set int8_teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; - std::unordered_set teller_set{"mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_mul", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "pool3d", - "deformable_conv"}; + std::unordered_set int8_teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + 
"gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; + std::unordered_set teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -1007,6 +1009,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "fused_preln_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) + << "fused_preln_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + 
VLOG(3) << "The id and emb size of fused PrelnEmbEltwiseLayerNormOp " + "should be same "; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnEmbEltwiseLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "gelu") { if (desc.Input("X").size() != 1) { VLOG(3) << "gelu op has only 1 input, but got " @@ -1316,6 +1336,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "preln_skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the preln_skip_layernorm does not support static shape yet"; + return false; + } + if (!desc.HasAttr("enable_int8")) { + VLOG(3) << "PrelnEmbEltwiseLayerNormOp must use int8 mode."; + return false; + } + } + if (op_type == "multihead_matmul") { if (!with_dynamic_shape) { VLOG(3) << "the multihead_matmul does not support static shape yet";