[Paddle-Inference] support preln-ernie: add preln_emb_eltwise_layernorm_op,...

[Paddle-Inference] support preln-ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op (#39570) * support preln_ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op * support preln_ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op

[Paddle-Inference] support preln-ernie: add preln_emb_eltwise_layernorm_op,...
[Paddle-Inference] support preln-ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op (#39570) * support preln_ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op * support preln_ernie: add preln_emb_eltwise_layernorm_op, preln_skip_layernorm_op
f31c2426 · Wangzheee · GitHub · ae92da87 · f31c2426 · f31c2426
5 changed files
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1470,6 +1470,8 @@ USE_TRT_CONVERTER(conv3d_transpose);
 USE_TRT_CONVERTER(mish);
 USE_TRT_CONVERTER(deformable_conv);
 USE_TRT_CONVERTER(pool3d)
+USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm)
+USE_TRT_CONVERTER(preln_skip_layernorm)
 #endif

 namespace paddle_infer {

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -21,6 +21,8 @@ nv_library(tensorrt_converter
                nearest_interp_v2_op.cc
                pool3d_op.cc
                deformable_conv_op.cc
+                preln_emb_eltwise_layernorm.cc
+                preln_skip_layernorm.cc
           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS

--- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
+
+// Forward declarations of the framework types that appear in the converter's
+// operator() signature below (framework::Scope, framework::proto::OpDesc).
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts a fused_preln_embedding_eltwise_layernorm op into the TensorRT
+// plugin "CustomEmbLayerNormPluginDynamic" (version "3"), followed by one
+// shuffle layer per plugin output.
+//
+// Preconditions checked at conversion time:
+//   - built against TensorRT >= 7.0 (otherwise conversion always throws);
+//   - engine_->use_oss() and engine_->with_interleaved() are both true;
+//   - the op has an "enable_int8" attribute (presence only, not its value).
+//
+// Op inputs read:  WordId, PosId, SentId, WordEmbedding, PosEmbedding,
+//                  SentEmbedding, Bias, Scale.
+// Op outputs used: Out_0 and Out_1 are bound to the shuffle outputs; Out is
+//                  only used to build layer names.
+// Op attrs read:   out_0_threshold, out_1_threshold.
+class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+#if IS_TRT_VERSION_GE(7000)
+    VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer";
+
+    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "PrelnErnie: If you want to use oss, must be with interleaved"));
+    }
+    framework::OpDesc op_desc(op, nullptr);
+    // Only the presence of the attribute is tested, not its value.
+    if (!op_desc.HasAttr("enable_int8")) {
+      PADDLE_THROW(
+          platform::errors::Fatal("use with_interleaved must be int8."));
+    }
+
+    const auto word_id_name = op_desc.Input("WordId").front();
+    const auto pos_id_name = op_desc.Input("PosId").front();
+    engine_->Set("ernie_pos_name", new std::string(pos_id_name));
+    const auto sent_id_name = op_desc.Input("SentId").front();
+
+    // emb_names[0]: word_embedding
+    // emb_names[1]: pos_embedding
+    // emb_names[2]: sent_embedding
+    const std::vector<std::string> emb_names{
+        op_desc.Input("WordEmbedding").front(),
+        op_desc.Input("PosEmbedding").front(),
+        op_desc.Input("SentEmbedding").front()};
+
+    // Resolve every id tensor exactly once; they are handed to the plugin
+    // further down.
+    auto* word_id_tensor = engine_->GetITensor(word_id_name);
+    auto* pos_id_tensor = engine_->GetITensor(pos_id_name);
+    auto* sent_id_tensor = engine_->GetITensor(sent_id_name);
+
+    // Looks up a persistable variable in `scope`, reports its dims through
+    // `dims` and returns the CPU weight pointer obtained from the engine.
+    auto get_persistable_data = [&](const std::string& var_name,
+                                    framework::DDim* dims) -> float* {
+      auto* temp_var = scope.FindVar(var_name);
+      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
+      (*dims) = temp_tensor->dims();
+      return engine_->GetWeightCPUData(var_name, temp_tensor, false);
+    };
+
+    std::vector<float*> input_embs;
+    std::vector<int32_t> emb_sizes;
+    for (const auto& emb_name : emb_names) {
+      framework::DDim emb_dims;
+      float* emb_data = get_persistable_data(emb_name, &emb_dims);
+      PADDLE_ENFORCE_EQ(
+          emb_dims.size(), 2,
+          platform::errors::InvalidArgument(
+              "The fused PrelnEmbEltwiseLayerNorm's emb should be 2 dims."));
+      input_embs.push_back(emb_data);
+      emb_sizes.push_back(static_cast<int32_t>(framework::product(emb_dims)));
+    }
+
+    framework::DDim bias_dims, scale_dims;
+    auto* bias =
+        get_persistable_data(op_desc.Input("Bias").front(), &bias_dims);
+    auto* scale =
+        get_persistable_data(op_desc.Input("Scale").front(), &scale_dims);
+    const int64_t bias_size = framework::product(bias_dims);
+    const int64_t scale_size = framework::product(scale_dims);
+    int output_int8 = 1;
+
+    const std::vector<nvinfer1::PluginField> fields{
+        {"bert_embeddings_layernorm_beta", bias,
+         nvinfer1::PluginFieldType::kFLOAT32, static_cast<int32_t>(bias_size)},
+        {"bert_embeddings_layernorm_gamma", scale,
+         nvinfer1::PluginFieldType::kFLOAT32, static_cast<int32_t>(scale_size)},
+        {"bert_embeddings_word_embeddings", input_embs[0],
+         nvinfer1::PluginFieldType::kFLOAT32, emb_sizes[0]},
+        {"bert_embeddings_token_type_embeddings", input_embs[2],
+         nvinfer1::PluginFieldType::kFLOAT32, emb_sizes[2]},
+        {"bert_embeddings_position_embeddings", input_embs[1],
+         nvinfer1::PluginFieldType::kFLOAT32, emb_sizes[1]},
+        {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1},
+    };
+
+    // The collection only has to stay alive while the plugin is created (the
+    // previous code freed it right after adding the layer), so a stack object
+    // replaces the unchecked malloc/free pair and cannot leak on a throw.
+    nvinfer1::PluginFieldCollection plugin_fc;
+    plugin_fc.nbFields = static_cast<int>(fields.size());
+    plugin_fc.fields = fields.data();
+
+    std::vector<nvinfer1::ITensor*> plugin_inputs;
+    plugin_inputs.emplace_back(word_id_tensor);  // word_embedding,
+                                                 // eval_placeholder_0
+    plugin_inputs.emplace_back(sent_id_tensor);  // sent_embedding,
+                                                 // eval_placeholder_1
+    plugin_inputs.emplace_back(pos_id_tensor);   // cu_seqlens,
+                                                 // eval_placeholder_2
+
+    // The fourth network input is used as max_seqlen; fail with a clear
+    // message instead of dereferencing a null tensor when it is absent.
+    auto* max_seqlen_input = engine_->network()->getInput(3);
+    PADDLE_ENFORCE_NE(
+        max_seqlen_input, nullptr,
+        platform::errors::InvalidArgument(
+            "PrelnEmbEltwiseLayerNorm expects a 4th network input "
+            "(max_seqlen), but the network does not have one."));
+    auto max_seqlen_tensor =
+        engine_->GetITensor(max_seqlen_input->getName());
+    auto* shuffle_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
+    nvinfer1::Dims shape_dim;
+    shape_dim.nbDims = 1;
+    shape_dim.d[0] = -1;
+    shuffle_layer->setReshapeDimensions(shape_dim);
+    shuffle_layer->setName(
+        ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " +
+         op_desc.Output("Out")[0] + ")")
+            .c_str());
+    engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+    plugin_inputs.emplace_back(
+        shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+
+    auto creator = GetPluginRegistry()->getPluginCreator(
+        "CustomEmbLayerNormPluginDynamic", "3");
+    PADDLE_ENFORCE_NE(
+        creator, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to get creator of CustomEmbLayerNormPluginDynamic "
+            "(version 3)"));
+
+    auto plugin_obj =
+        creator->createPlugin("CustomEmbLayerNormPluginDynamic", &plugin_fc);
+    PADDLE_ENFORCE_NE(
+        plugin_obj, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to create CustomEmbLayerNormPluginDynamic plugin"));
+    auto plugin_layer = engine_->network()->addPluginV2(
+        plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
+    PADDLE_ENFORCE_NE(
+        plugin_layer, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to add CustomEmbLayerNormPluginDynamic layer"));
+    plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " +
+                           op_desc.Output("Out")[0] + ")")
+                              .c_str());
+
+    float out_0_scale =
+        BOOST_GET_CONST(float, op_desc.GetAttr("out_0_threshold"));
+    float out_1_scale =
+        BOOST_GET_CONST(float, op_desc.GetAttr("out_1_threshold"));
+    engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_0_scale);
+    engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_1_scale);
+
+    // Each plugin output goes through a shuffle applying the permutation
+    // {2, 1, 0, 3}; the shuffle output is what gets bound to Out_0 / Out_1.
+    auto* shuffler_embed_out0 =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(0)));
+    nvinfer1::Permutation transpose_0{2, 1, 0, 3};
+    shuffler_embed_out0->setSecondTranspose(transpose_0);
+    shuffler_embed_out0->getOutput(0)->setName(
+        op_desc.Output("Out_0")[0].c_str());
+    engine_->SetITensor(op_desc.Output("Out_0")[0],
+                        shuffler_embed_out0->getOutput(0));
+    shuffler_embed_out0->setName(
+        ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_0: " +
+         op_desc.Output("Out_0")[0] + ")")
+            .c_str());
+
+    auto* shuffler_embed_out1 =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(plugin_layer->getOutput(1)));
+    nvinfer1::Permutation transpose_1{2, 1, 0, 3};
+    shuffler_embed_out1->setSecondTranspose(transpose_1);
+    shuffler_embed_out1->getOutput(0)->setName(
+        op_desc.Output("Out_1")[0].c_str());
+
+    engine_->SetITensor(op_desc.Output("Out_1")[0],
+                        shuffler_embed_out1->getOutput(0));
+    shuffler_embed_out1->setName(
+        ("shuffler_after_CustomPrelnEmbLayerNormPluginDynamic_V3(Output_1: " +
+         op_desc.Output("Out_1")[0] + ")")
+            .c_str());
+
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "PrelnErnie: using oss with interleaved requires a TensorRT version "
+        "no less than 7.0"));
+#endif
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+// Registers the converter under the op type name
+// "fused_preln_embedding_eltwise_layernorm". The same name is referenced by
+// USE_TRT_CONVERTER in analysis_predictor.cc and listed in the op_teller sets.
+REGISTER_TRT_OP_CONVERTER(fused_preln_embedding_eltwise_layernorm,
+                          PrelnEmbEltwiseLayerNormOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts a preln_skip_layernorm op into the TensorRT plugin
+// "CustomSkipLayerNormPluginDynamic" (version "4").
+//
+// Preconditions checked at conversion time:
+//   - built against TensorRT >= 7.0 (otherwise conversion always throws);
+//   - engine_->use_oss() and engine_->with_interleaved() are both true;
+//   - the op has an "enable_int8" attribute (presence only, not its value).
+//
+// Op inputs read: X, Y (tensors fed to the plugin), Bias and Scale (weights
+// passed as the plugin fields "beta" and "gamma"). Op output used: Out.
+class PrelnSkipLayerNormOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+#if IS_TRT_VERSION_GE(7000)
+    VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer";
+    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "PrelnErnie: If you want to use oss, must be with interleaved"));
+    }
+    framework::OpDesc op_desc(op, nullptr);
+    // Only the presence of the attribute is tested, not its value.
+    if (!op_desc.HasAttr("enable_int8")) {
+      PADDLE_THROW(
+          platform::errors::Fatal("use with_interleaved must be int8."));
+    }
+
+    // Declare inputs
+    std::vector<nvinfer1::ITensor*> inputs;
+    inputs.push_back(engine_->GetITensor(op_desc.Input("X")[0]));
+    inputs.push_back(engine_->GetITensor(op_desc.Input("Y")[0]));
+
+    // Resolves the op input `arg_name` to a persistable variable in `scope`,
+    // reports its dims through `dims` and returns the CPU weight pointer
+    // obtained from the engine.
+    auto get_persistable_data = [&](const std::string& arg_name,
+                                    framework::DDim* dims) -> float* {
+      std::string var_name = op_desc.Input(arg_name).front();
+      auto* temp_var = scope.FindVar(var_name);
+      auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
+      (*dims) = temp_tensor->dims();
+      return engine_->GetWeightCPUData(var_name, temp_tensor, false);
+    };
+
+    framework::DDim bias_dims, scale_dims;
+    auto* bias = get_persistable_data("Bias", &bias_dims);
+    auto* scale = get_persistable_data("Scale", &scale_dims);
+    const int bias_size = static_cast<int>(framework::product(bias_dims));
+    const int scale_size = static_cast<int>(framework::product(scale_dims));
+
+    VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved";
+
+    auto creator = GetPluginRegistry()->getPluginCreator(
+        "CustomSkipLayerNormPluginDynamic", "4");
+    PADDLE_ENFORCE_NE(
+        creator, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to get creator of CustomSkipLayerNormPluginDynamic "
+            "(version 4)"));
+    const std::vector<nvinfer1::PluginField> fields{
+        {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size},
+        {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}};
+
+    // The collection used to be malloc'd and never freed, leaking on every
+    // conversion. It points at `fields`, which is itself local, so nothing
+    // may rely on it after this function returns; a stack object is enough.
+    nvinfer1::PluginFieldCollection plugin_fc;
+    plugin_fc.nbFields = static_cast<int>(fields.size());
+    plugin_fc.fields = fields.data();
+
+    auto pluginObj =
+        creator->createPlugin("CustomSkipLayerNormPluginDynamic", &plugin_fc);
+    PADDLE_ENFORCE_NE(
+        pluginObj, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to create CustomSkipLayerNormPluginDynamic plugin"));
+    auto plugin_layer = engine_->network()->addPluginV2(
+        inputs.data(), inputs.size(), *pluginObj);
+
+    PADDLE_ENFORCE_NE(
+        plugin_layer, nullptr,
+        platform::errors::InvalidArgument(
+            "fail to add CustomPrelnSkipLayerNormPluginDynamic layer"));
+
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(plugin_layer, "preln_skip_layernorm",
+                             {output_name}, test_mode);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "PrelnErnie: using oss with interleaved requires a TensorRT version "
+        "no less than 7.0"));
+#endif
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+// Registers the converter under the op type name "preln_skip_layernorm". The
+// same name is referenced by USE_TRT_CONVERTER in analysis_predictor.cc and
+// listed in the op_teller sets.
+REGISTER_TRT_OP_CONVERTER(preln_skip_layernorm, PrelnSkipLayerNormOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -30,24 +30,6 @@ namespace tensorrt {
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
  SimpleOpTypeSetTeller() {
-#if IS_TRT_VERSION_GE(5130)
-    teller_set.insert("relu6");
-    teller_set.insert("hard_sigmoid");
-    teller_set.insert("clip");
-    int8_teller_set.insert("relu6");
-    int8_teller_set.insert("hard_sigmoid");
-    int8_teller_set.insert("clip");
-#endif
-#if IS_TRT_VERSION_GE(6000)
-    teller_set.insert("fused_embedding_eltwise_layernorm");
-    teller_set.insert("multihead_matmul");
-    teller_set.insert("skip_layernorm");
-    teller_set.insert("slice");
-    int8_teller_set.insert("fused_embedding_eltwise_layernorm");
-    int8_teller_set.insert("multihead_matmul");
-    int8_teller_set.insert("skip_layernorm");
-    int8_teller_set.insert("slice");
-#endif
 // TODO(baoachun) The group_norm trt plugin will check input's dim
 // not -1 failed when dynamic shape mode.
 // #if IS_TRT_VERSION_GE(7130)
@@ -76,104 +58,124 @@ struct SimpleOpTypeSetTeller : public Teller {

 private:
  // use this set for no calib int8.
-  std::unordered_set<std::string> int8_teller_set{"mul",
-                                                  "matmul",
-                                                  "conv2d",
-                                                  "conv2d_fusion",
-                                                  "pool2d",
-                                                  "relu",
-                                                  "softmax",
-                                                  "sigmoid",
-                                                  "hard_swish",
-                                                  "depthwise_conv2d",
-                                                  "batch_norm",
-                                                  "concat",
-                                                  "tanh",
-                                                  "pad",
-                                                  "elementwise_add",
-                                                  "elementwise_mul",
-                                                  "dropout",
-                                                  "prelu",
-                                                  "conv2d_transpose",
-                                                  "depthwise_conv2d_transpose",
-                                                  "leaky_relu",
-                                                  "fc",
-                                                  "shuffle_channel",
-                                                  "swish",
-                                                  "split",
-                                                  "instance_norm",
-                                                  "gelu",
-                                                  "layer_norm",
-                                                  "scale",
-                                                  "stack",
-                                                  "transpose2",
-                                                  "transpose",
-                                                  "flatten2",
-                                                  "flatten",
-                                                  "gather",
-                                                  "gather_nd",
-                                                  "yolo_box",
-                                                  "roi_align",
-                                                  "affine_channel",
-                                                  "nearest_interp",
-                                                  "anchor_generator",
-                                                  "reduce_sum",
-                                                  "reduce_mean",
-                                                  "conv3d",
-                                                  "conv3d_transpose",
-                                                  "mish",
-                                                  "nearest_interp_v2",
-                                                  "pool3d",
-                                                  "deformable_conv"};
-  std::unordered_set<std::string> teller_set{"mul",
-                                             "matmul",
-                                             "conv2d",
-                                             "conv2d_fusion",
-                                             "pool2d",
-                                             "relu",
-                                             "softmax",
-                                             "sigmoid",
-                                             "hard_swish",
-                                             "depthwise_conv2d",
-                                             "batch_norm",
-                                             "concat",
-                                             "tanh",
-                                             "pad",
-                                             "elementwise_add",
-                                             "elementwise_mul",
-                                             "dropout",
-                                             "prelu",
-                                             "conv2d_transpose",
-                                             "depthwise_conv2d_transpose",
-                                             "leaky_relu",
-                                             "fc",
-                                             "shuffle_channel",
-                                             "swish",
-                                             "split",
-                                             "instance_norm",
-                                             "gelu",
-                                             "layer_norm",
-                                             "scale",
-                                             "stack",
-                                             "transpose2",
-                                             "transpose",
-                                             "flatten2",
-                                             "flatten",
-                                             "gather",
-                                             "gather_nd",
-                                             "yolo_box",
-                                             "roi_align",
-                                             "affine_channel",
-                                             "nearest_interp",
-                                             "anchor_generator",
-                                             "reduce_sum",
-                                             "reduce_mean",
-                                             "conv3d",
-                                             "conv3d_transpose",
-                                             "mish",
-                                             "nearest_interp_v2",
-                                             "pool3d",
-                                             "deformable_conv"};
+  std::unordered_set<std::string> int8_teller_set{
+      "mul",
+      "matmul",
+      "conv2d",
+      "conv2d_fusion",
+      "pool2d",
+      "relu",
+      "softmax",
+      "sigmoid",
+      "hard_swish",
+      "depthwise_conv2d",
+      "batch_norm",
+      "concat",
+      "tanh",
+      "pad",
+      "elementwise_add",
+      "elementwise_mul",
+      "dropout",
+      "prelu",
+      "conv2d_transpose",
+      "depthwise_conv2d_transpose",
+      "leaky_relu",
+      "fc",
+      "shuffle_channel",
+      "swish",
+      "split",
+      "instance_norm",
+      "gelu",
+      "layer_norm",
+      "scale",
+      "stack",
+      "transpose2",
+      "transpose",
+      "flatten2",
+      "flatten",
+      "gather",
+      "gather_nd",
+      "yolo_box",
+      "roi_align",
+      "affine_channel",
+      "nearest_interp",
+      "anchor_generator",
+      "reduce_sum",
+      "reduce_mean",
+      "conv3d",
+      "conv3d_transpose",
+      "mish",
+      "nearest_interp_v2",
+      "pool3d",
+      "deformable_conv",
+      "relu6",
+      "hard_sigmoid",
+      "clip",
+      "fused_embedding_eltwise_layernorm",
+      "multihead_matmul",
+      "skip_layernorm",
+      "slice",
+      "fused_preln_embedding_eltwise_layernorm",
+      "preln_skip_layernorm"};
+  std::unordered_set<std::string> teller_set{
+      "mul",
+      "matmul",
+      "conv2d",
+      "conv2d_fusion",
+      "pool2d",
+      "relu",
+      "softmax",
+      "sigmoid",
+      "hard_swish",
+      "depthwise_conv2d",
+      "batch_norm",
+      "concat",
+      "tanh",
+      "pad",
+      "elementwise_add",
+      "elementwise_mul",
+      "dropout",
+      "prelu",
+      "conv2d_transpose",
+      "depthwise_conv2d_transpose",
+      "leaky_relu",
+      "fc",
+      "shuffle_channel",
+      "swish",
+      "split",
+      "instance_norm",
+      "gelu",
+      "layer_norm",
+      "scale",
+      "stack",
+      "transpose2",
+      "transpose",
+      "flatten2",
+      "flatten",
+      "gather",
+      "gather_nd",
+      "yolo_box",
+      "roi_align",
+      "affine_channel",
+      "nearest_interp",
+      "anchor_generator",
+      "reduce_sum",
+      "reduce_mean",
+      "conv3d",
+      "conv3d_transpose",
+      "mish",
+      "nearest_interp_v2",
+      "pool3d",
+      "deformable_conv",
+      "relu6",
+      "hard_sigmoid",
+      "clip",
+      "fused_embedding_eltwise_layernorm",
+      "multihead_matmul",
+      "skip_layernorm",
+      "slice",
+      "fused_preln_embedding_eltwise_layernorm",
+      "preln_skip_layernorm"};
 };

 bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
@@ -1007,6 +1009,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
      }
    }

+    // fused_preln_embedding_eltwise_layernorm is rejected unless dynamic shape
+    // is enabled, the "Ids" and "Embs" input lists have the same length, and
+    // the op carries an "enable_int8" attribute (presence only).
+    // NOTE(review): the converter added in preln_emb_eltwise_layernorm.cc reads
+    // the inputs "WordId"/"PosId"/"SentId" and "WordEmbedding"/"PosEmbedding"/
+    // "SentEmbedding", not "Ids"/"Embs" -- confirm the fused op really exposes
+    // "Ids" and "Embs", otherwise this lookup cannot succeed.
+    if (op_type == "fused_preln_embedding_eltwise_layernorm") {
+      if (!with_dynamic_shape) {
+        VLOG(3)
+            << "fused_preln_embedding_eltwise_layernorm should run on dynamic "
+               "shape mode.";
+        return false;
+      }
+      if (desc.Input("Ids").size() != desc.Input("Embs").size()) {
+        VLOG(3) << "The id and emb size of fused PrelnEmbEltwiseLayerNormOp "
+                   "should be same ";
+        return false;
+      }
+      if (!desc.HasAttr("enable_int8")) {
+        VLOG(3) << "PrelnEmbEltwiseLayerNormOp must use int8 mode.";
+        return false;
+      }
+    }
+
    if (op_type == "gelu") {
      if (desc.Input("X").size() != 1) {
        VLOG(3) << "gelu op has only 1 input, but got "
@@ -1316,6 +1336,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
      }
    }

+    // preln_skip_layernorm is rejected unless dynamic shape is enabled and the
+    // op carries an "enable_int8" attribute (presence only).
+    if (op_type == "preln_skip_layernorm") {
+      if (!with_dynamic_shape) {
+        VLOG(3) << "the preln_skip_layernorm does not support static shape yet";
+        return false;
+      }
+      if (!desc.HasAttr("enable_int8")) {
+        // The log used to name PrelnEmbEltwiseLayerNormOp (copy-paste slip).
+        VLOG(3) << "PrelnSkipLayerNormOp must use int8 mode.";
+        return false;
+      }
+    }
+
    if (op_type == "multihead_matmul") {
      if (!with_dynamic_shape) {
        VLOG(3) << "the multihead_matmul does not support static shape yet";