Unverified Commit ac0553a0, authored by Yuanle Liu, committed by GitHub

fused_embedding_eltwise_layernorm_op and skip_layernorm_op support fp16 (#44969)

Parent 3512bf11
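For context: the converters changed below only take the new GetFp16TrtWeight() path when engine_->WithFp16() is true and plugin fp16 is not disabled. The following is a minimal, hypothetical sketch (not part of this commit) of the inference-side configuration that turns that path on; the header path, tensor name, and shape ranges are placeholders.

#include <map>
#include <string>
#include <vector>
#include "paddle_inference_api.h"  // public Paddle Inference header; install layout assumed

paddle::AnalysisConfig MakeFp16TrtConfig(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);
  config.EnableUseGpu(100 /* MB of initial GPU memory pool */, 0 /* device id */);
  // Precision::kHalf makes engine_->WithFp16() return true, which is what the
  // EmbEltwiseLayerNorm / SkipLayerNorm converters check before loading fp16 weights.
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 1 /* max batch */,
                              5 /* min subgraph size */,
                              paddle::AnalysisConfig::Precision::kHalf,
                              false /* use_static */, false /* use_calib_mode */);
  // The dynamic-shape plugins in this diff need a dynamic shape profile; the tensor
  // name "read_file_0.tmp_0" and the ranges below are illustrative only.
  std::map<std::string, std::vector<int>> min_shape{{"read_file_0.tmp_0", {1, 1}}};
  std::map<std::string, std::vector<int>> max_shape{{"read_file_0.tmp_0", {10, 128}}};
  std::map<std::string, std::vector<int>> opt_shape{{"read_file_0.tmp_0", {1, 64}}};
  // Keep the last argument (disable_trt_plugin_fp16) false so the plugins may run in fp16.
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape, false);
  return config;
}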
......@@ -166,7 +166,6 @@ if(WITH_TENSORRT)
pass_library(trt_map_matmul_to_mul_pass inference)
pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference)
pass_library(trt_multihead_matmul_fuse_pass inference)
pass_library(trt_skip_layernorm_fuse_pass inference)
pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
pass_library(preln_skip_layernorm_fuse_pass inference)
pass_library(set_transformer_input_convert_pass inference)
......@@ -177,6 +176,7 @@ endif()
if(WITH_GPU OR WITH_ROCM)
pass_library(cudnn_placement_pass base DEPS placement_pass_base)
pass_library(embedding_eltwise_layernorm_fuse_pass inference)
pass_library(trt_skip_layernorm_fuse_pass inference)
endif()
if(WITH_MKLDNN)
......
......@@ -165,12 +165,17 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
"gpu_cpu_map_matmul_v2_to_matmul_pass",
"fc_fuse_pass",
"fc_elementwise_layernorm_fuse_pass",
"embedding_eltwise_layernorm_fuse_pass",
"trt_skip_layernorm_fuse_pass",
"runtime_context_cache_pass",
};
const std::vector<std::string> kTrtLowerPrecisionPasses{
"simplify_with_basic_ops_pass",
// "conv_bn_fuse_pass",
// "conv_eltwiseadd_bn_fuse_pass",
"trt_embedding_eltwise_layernorm_fuse_pass",
"trt_skip_layernorm_fuse_pass",
"trt_map_matmul_v2_to_mul_pass",
"trt_map_matmul_v2_to_matmul_pass",
"trt_map_matmul_to_mul_pass",
......@@ -186,6 +191,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"trt_skip_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
......
......@@ -133,6 +133,15 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
return weight;
};
auto GetFp16Weight = [&](const std::string& var_name,
framework::DDim* dim) -> TensorRTEngine::Weight {
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
*dim = temp_tensor->dims();
auto weight = engine_->GetFp16TrtWeight(var_name, *temp_tensor);
return weight;
};
auto GetFp32Weight = [&](const std::string& var_name,
framework::DDim* dim) -> TensorRTEngine::Weight {
auto* temp_var = scope.FindVar(var_name);
......@@ -141,7 +150,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor);
return weight;
};
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
int hidden = 0;
for (int i = 0; i < input_num; i++) {
framework::DDim emb_dims;
......@@ -149,7 +158,11 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
if (flag_varseqlen) {
weight = GetWeight(emb_names[i], &emb_dims);
} else {
weight = GetFp32Weight(emb_names[i], &emb_dims);
if (with_fp16) {
weight = GetFp16Weight(emb_names[i], &emb_dims);
} else {
weight = GetFp32Weight(emb_names[i], &emb_dims);
}
}
input_embs.push_back(weight.get());
emb_sizes.push_back(weight.get().count);
......@@ -167,8 +180,15 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
} else {
bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight = GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims);
if (with_fp16) {
bias_weight = GetFp16Weight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight =
GetFp16Weight(op_desc.Input("Scale").front(), &scale_dims);
} else {
bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims);
scale_weight =
GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims);
}
}
int64_t bias_size = phi::product(bias_dims);
......@@ -282,21 +302,18 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
test_mode);
}
} else {
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
plugin::DynamicPluginTensorRT* plugin = nullptr;
std::vector<float*> input_embs_data;
std::vector<void*> input_embs_data;
for (size_t i = 0; i < input_embs.size(); ++i) {
input_embs_data.push_back(const_cast<float*>(
static_cast<const float*>(input_embs[i].values)));
input_embs_data.push_back(const_cast<void*>(
reinterpret_cast<const void*>(input_embs[i].values)));
}
plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
input_embs_data,
const_cast<float*>(
static_cast<const float*>(bias_weight.get().values)),
const_cast<float*>(
static_cast<const float*>(scale_weight.get().values)),
const_cast<void*>(static_cast<const void*>(bias_weight.get().values)),
const_cast<void*>(
static_cast<const void*>(scale_weight.get().values)),
emb_sizes,
bias_size,
scale_size,
......
......@@ -150,6 +150,15 @@ class SkipLayerNormOpConverter : public OpConverter {
layer = plugin_layer;
}
} else {
auto GetFp16Weight =
[&](const std::string& arg_name) -> TensorRTEngine::Weight {
std::string var_name = op_desc.Input(arg_name).front();
auto* temp_var = scope.FindVar(var_name);
auto* temp_tensor = temp_var->GetMutable<framework::LoDTensor>();
auto weight = engine_->GetFp16TrtWeight(var_name, *temp_tensor);
return weight;
};
auto GetFp32Weight =
[&](const std::string& arg_name) -> TensorRTEngine::Weight {
std::string var_name = op_desc.Input(arg_name).front();
......@@ -159,20 +168,29 @@ class SkipLayerNormOpConverter : public OpConverter {
return weight;
};
auto bias_weight = GetFp32Weight("Bias").get();
auto scale_weight = GetFp32Weight("Scale").get();
// bool with_fp16 = engine_->WithFp16() &&
// !engine_->disable_trt_plugin_fp16() &&
// (input1->getType() == nvinfer1::DataType::kHALF);
bool with_fp16 = false;
TensorRTEngine::Weight bias_weight, scale_weight;
if (with_fp16) {
bias_weight = GetFp16Weight("Bias");
scale_weight = GetFp16Weight("Scale");
} else {
bias_weight = GetFp32Weight("Bias");
scale_weight = GetFp32Weight("Scale");
}
float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
// bool with_fp16 =
// engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
bool with_fp16 = false;
plugin::SkipLayerNormPluginDynamic* plugin =
new plugin::SkipLayerNormPluginDynamic(
static_cast<const float*>(bias_weight.values),
static_cast<const float*>(scale_weight.values),
bias_weight.count,
scale_weight.count,
const_cast<void*>(
static_cast<const void*>(bias_weight.get().values)),
const_cast<void*>(
static_cast<const void*>(scale_weight.get().values)),
bias_weight.get().count,
scale_weight.get().count,
eps,
with_fp16);
layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin);
......
......@@ -31,7 +31,7 @@ namespace inference {
namespace tensorrt {
void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
nvinfer1::DataType nv_type;
nvinfer1::DataType nv_type = nvinfer1::DataType::kFLOAT;
switch (type) {
case phi::DataType::FLOAT32:
nv_type = nvinfer1::DataType::kFLOAT;
......@@ -455,6 +455,67 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
// Note: Only for support plugin.
TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
const std::string &name, const framework::Tensor &weight_tensor) {
static int name_suffix_counter = 0;
std::string name_suffix = std::to_string(name_suffix_counter);
std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix),
0,
platform::errors::AlreadyExists(
"The weight named %s is set into the weight map "
"twice in TRT OP converter.",
name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor.dims());
TensorRTEngine::Weight weight;
weight.SetCount(weight_tensor.numel());
weight.SetDataType(nvinfer1::DataType::kHALF);
// weight_tensor.dims().;
// if trt not support dtype, we need to cast to fp16.
if (weight_tensor.dtype() == phi::DataType::BFLOAT16) {
framework::Tensor bf16_tensor;
bf16_tensor.clear();
paddle::framework::TensorCopySync(
weight_tensor, platform::CPUPlace(), &bf16_tensor);
weight_map[name_with_suffix]->set_type(
paddle::experimental::DataType::FLOAT16);
weight_map[name_with_suffix]->Resize(weight_tensor.dims());
auto *fp16_data = weight_map[name_with_suffix]->mutable_data<float16>(
platform::CPUPlace());
auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(platform::CPUPlace());
for (int i = 0; i < weight_tensor.numel(); i++) {
fp16_data[i] = static_cast<float16>(bf16_data[i]);
}
} else if (weight_tensor.dtype() == phi::DataType::FLOAT32) {
framework::Tensor fp32_tensor;
fp32_tensor.clear();
paddle::framework::TensorCopySync(
weight_tensor, platform::CPUPlace(), &fp32_tensor);
weight_map[name_with_suffix]->set_type(
paddle::experimental::DataType::FLOAT16);
weight_map[name_with_suffix]->Resize(weight_tensor.dims());
auto *fp16_data = weight_map[name_with_suffix]->mutable_data<float16>(
platform::CPUPlace());
auto *fp32_data = fp32_tensor.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < weight_tensor.numel(); i++) {
fp16_data[i] = static_cast<float16>(fp32_data[i]);
}
} else {
paddle::framework::TensorCopySync(
weight_tensor, cpu_place, weight_map[name_with_suffix].get());
}
weight.SetValues(weight_map[name_with_suffix]->data());
name_suffix_counter += 1;
return weight;
}
// Note: Only for support plugin.
TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight(
const std::string &name, const framework::Tensor &weight_tensor) {
static int name_suffix_counter = 0;
......
......@@ -421,6 +421,10 @@ class TensorRTEngine {
quant_dynamic_range_[tensor] = range;
}
// Get fp16 trt weight. If src weight is not fp16, we will cast.
Weight GetFp16TrtWeight(const std::string& name,
const framework::Tensor& weight_tensor);
// Get fp32 trt weight. If src weight is not fp32, we will cast.
Weight GetFp32TrtWeight(const std::string& name,
const framework::Tensor& weight_tensor);
......
......@@ -16,6 +16,7 @@
#include <cassert>
#include <cub/cub.cuh> // NOLINT
#include <type_traits>
#include <vector>
#include "glog/logging.h"
......@@ -32,12 +33,6 @@ namespace plugin {
// Dynamic shape plugin requires TRT version greater than 6.0.
#if IS_TRT_VERSION_GE(6000)
template <typename T>
EmbEltwiseLayernormPluginDynamicImpl<
T>::~EmbEltwiseLayernormPluginDynamicImpl() {}
inline half fp32tofp16(float x) { return static_cast<half>(x); }
template <typename T>
void EmbEltwiseLayernormPluginDynamicImpl<T>::shareGPUData(
const EmbEltwiseLayernormPluginDynamicImplBase *anthor) {
......@@ -62,36 +57,24 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
embs_gpu_.resize(embs_.size());
for (int i = 0; i < embs_.size(); i++) {
if (embs_[i]) {
T *host_ptr;
T *host_ptr = embs_[i];
auto size = emb_sizes_[i];
if (std::is_same<T, half>::value) {
host_ptr = new T[size];
std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
} else {
host_ptr = reinterpret_cast<T *>(embs_[i]);
}
cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
cudaMemcpy(
embs_gpu_[i], host_ptr, size * sizeof(T), cudaMemcpyHostToDevice);
if (std::is_same<T, half>::value) {
delete[] host_ptr;
}
}
}
if (bias_) {
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMalloc(&bias_gpu_, sizeof(T) * bias_size_);
cudaMemcpy(
bias_gpu_, bias_, bias_size_ * sizeof(float), cudaMemcpyHostToDevice);
bias_gpu_, bias_, bias_size_ * sizeof(T), cudaMemcpyHostToDevice);
}
if (scale_) {
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_,
scale_,
scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(T) * scale_size_);
cudaMemcpy(
scale_gpu_, scale_, scale_size_ * sizeof(T), cudaMemcpyHostToDevice);
}
int input_num = embs_.size();
......@@ -239,22 +222,14 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
"The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
PADDLE_ENFORCE_EQ(nb_outputs,
1,
platform::errors::InvalidArgument(
"The EmbEltwiseLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
int all_nums = nb_inputs + nb_outputs;
PADDLE_ENFORCE_LT(
pos,
nb_inputs + nb_outputs,
all_nums,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos,
nb_inputs + nb_outputs));
int all_nums = nb_inputs + nb_outputs;
all_nums));
const nvinfer1::PluginTensorDesc &desc = in_out[pos];
if (desc.format != nvinfer1::TensorFormat::kLINEAR) {
return false;
......@@ -269,7 +244,7 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
return desc.type == nvinfer1::DataType::kINT32 &&
desc.dims.d[0] == prev.dims.d[0] && desc.dims.d[1] == prev.dims.d[1];
}
// output
if (pos == all_nums - 1) {
if (with_fp16_ == false) {
return desc.type == nvinfer1::DataType::kFLOAT;
......@@ -288,7 +263,7 @@ nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
index,
0,
platform::errors::InvalidArgument(
"The EmbEltwiseLayernorm Plugin only has one input, so the "
"The EmbEltwiseLayernorm Plugin only has one output, so the "
"index value should be 0, but get %d.",
index));
if (with_fp16_)
......
......@@ -15,6 +15,7 @@
#pragma once
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>
......@@ -49,9 +50,9 @@ template <typename T>
class EmbEltwiseLayernormPluginDynamicImpl
: public EmbEltwiseLayernormPluginDynamicImplBase {
public:
explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
float* bias,
float* scale,
explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<T*> input_embs,
T* bias,
T* scale,
std::vector<int> emb_sizes,
int bias_size,
int scale_size,
......@@ -66,7 +67,7 @@ class EmbEltwiseLayernormPluginDynamicImpl
hidden_size_(hidden_size),
eps_(eps) {}
~EmbEltwiseLayernormPluginDynamicImpl();
~EmbEltwiseLayernormPluginDynamicImpl() {}
int initialize();
void terminate();
......@@ -79,13 +80,13 @@ class EmbEltwiseLayernormPluginDynamicImpl
void shareGPUData(const EmbEltwiseLayernormPluginDynamicImplBase* anthor);
private:
std::vector<float*> embs_;
float* bias_{nullptr};
float* scale_{nullptr};
std::vector<T*> embs_;
T* bias_{nullptr};
T* scale_{nullptr};
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
T* bias_gpu_{nullptr};
T* scale_gpu_{nullptr};
std::vector<T*> embs_gpu_;
std::vector<int> emb_sizes_;
......@@ -101,9 +102,9 @@ class EmbEltwiseLayernormPluginDynamicImpl
class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
float* bias,
float* scale,
explicit EmbEltwiseLayernormPluginDynamic(std::vector<void*> input_embs,
void* bias,
void* scale,
std::vector<int> emb_sizes,
int bias_size,
int scale_size,
......@@ -123,14 +124,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp16";
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(embs_,
bias_,
scale_,
emb_sizes_,
bias_size_,
scale_size_,
hidden_size_,
eps_);
instantiateImpl<half>();
#else
PADDLE_THROW(platform::errors::Fatal(
"The Ernie(Bert) tensorRT plugin should be "
......@@ -141,63 +135,74 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
#endif
} else {
VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp32";
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(embs_,
bias_,
scale_,
emb_sizes_,
bias_size_,
scale_size_,
hidden_size_,
eps_);
instantiateImpl<float>();
}
}
EmbEltwiseLayernormPluginDynamic(void const* serial_data,
size_t serial_length)
: own_host_buff_(true) {
// the first var is with_fp16, we will use it.
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
auto ptr = new float[size];
memcpy(ptr, serial_data, sizeof(float) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float);
}
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
if (bias_size_) {
bias_ = new float[bias_size_];
memcpy(bias_, serial_data, sizeof(float) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
embs_.resize(emb_sizes_.size());
if (with_fp16_) {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
auto ptr = new half[size];
memcpy(ptr, serial_data, sizeof(half) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) += size * sizeof(half);
serial_length -= size * sizeof(half);
}
if (bias_size_) {
bias_ = new half[bias_size_];
memcpy(bias_, serial_data, sizeof(half) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(half);
serial_length -= bias_size_ * sizeof(half);
if (scale_size_) {
scale_ = new float[scale_size_];
memcpy(scale_, serial_data, sizeof(float) * scale_size_);
if (scale_size_) {
scale_ = new half[scale_size_];
memcpy(scale_, serial_data, sizeof(half) * scale_size_);
}
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(half);
serial_length -= scale_size_ * sizeof(half);
} else {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
auto ptr = new float[size];
memcpy(ptr, serial_data, sizeof(float) * size);
embs_[i] = ptr;
reinterpret_cast<char const*&>(serial_data) += size * sizeof(float);
serial_length -= size * sizeof(float);
}
if (bias_size_) {
bias_ = new float[bias_size_];
memcpy(bias_, serial_data, sizeof(float) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
if (scale_size_) {
scale_ = new float[scale_size_];
memcpy(scale_, serial_data, sizeof(float) * scale_size_);
}
reinterpret_cast<char const*&>(serial_data) +=
scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
}
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(embs_,
bias_,
scale_,
emb_sizes_,
bias_size_,
scale_size_,
hidden_size_,
eps_);
instantiateImpl<half>();
#else
PADDLE_THROW(platform::errors::Fatal(
"The Ernie(Bert) tensorRT plugin should be "
......@@ -207,14 +212,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
"AnalysisConfig::Precision::kFloat32, false, false) "));
#endif
} else {
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(embs_,
bias_,
scale_,
emb_sizes_,
bias_size_,
scale_size_,
hidden_size_,
eps_);
instantiateImpl<float>();
}
}
......@@ -241,44 +239,68 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
size_t getSerializationSize() const TRT_NOEXCEPT override {
int sum_num = 0;
sum_num += SerializedSize(with_fp16_);
sum_num += SerializedSize(emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
sum_num += emb_sizes_[i] * sizeof(float);
if (with_fp16_) {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
sum_num += emb_sizes_[i] * sizeof(half);
}
sum_num += (bias_size_ + scale_size_) * sizeof(half);
} else {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
sum_num += emb_sizes_[i] * sizeof(float);
}
sum_num += (bias_size_ + scale_size_) * sizeof(float);
}
sum_num += SerializedSize(bias_size_);
sum_num += SerializedSize(scale_size_);
sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_);
sum_num += SerializedSize(with_fp16_);
return sum_num;
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
// the first var is for with_fp16, we will use it later;
SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, embs_[i][j]);
}
}
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, bias_[i]);
}
if (with_fp16_) {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, reinterpret_cast<half*>(embs_[i])[j]);
}
}
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<half*>(bias_)[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, scale_[i]);
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<half*>(scale_)[i]);
}
} else {
for (size_t i = 0; i < emb_sizes_.size(); i++) {
auto size = emb_sizes_[i];
for (int j = 0; j < size; ++j) {
SerializeValue(&buffer, reinterpret_cast<float*>(embs_[i])[j]);
}
}
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<float*>(bias_)[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<float*>(scale_)[i]);
}
}
SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(int output_index,
......@@ -317,21 +339,28 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
void destroy() TRT_NOEXCEPT override {
if (own_host_buff_) {
for (auto ptr : embs_) {
delete[] ptr;
if (with_fp16_) {
for (auto ptr : embs_) {
delete[] reinterpret_cast<half*>(ptr);
}
delete[] reinterpret_cast<half*>(bias_);
delete[] reinterpret_cast<half*>(scale_);
} else {
for (auto ptr : embs_) {
delete[] reinterpret_cast<float*>(ptr);
}
delete[] reinterpret_cast<float*>(bias_);
delete[] reinterpret_cast<float*>(scale_);
}
delete[] bias_;
delete[] scale_;
}
delete impl_;
delete this;
}
private:
std::vector<float*> embs_;
float* bias_;
float* scale_;
std::vector<void*> embs_;
void* bias_{nullptr};
void* scale_{nullptr};
std::vector<int> emb_sizes_;
int bias_size_;
......@@ -345,6 +374,24 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
void shareGPUData(const EmbEltwiseLayernormPluginDynamic* anthor) {
impl_->shareGPUData(anthor->impl_);
}
template <typename U>
void instantiateImpl() {
std::vector<U*> embs;
embs.resize(embs_.size());
for (size_t i = 0; i < embs_.size(); ++i) {
embs[i] = reinterpret_cast<U*>(embs_[i]);
}
impl_ = new EmbEltwiseLayernormPluginDynamicImpl<U>(
embs,
reinterpret_cast<U*>(bias_),
reinterpret_cast<U*>(scale_),
emb_sizes_,
bias_size_,
scale_size_,
hidden_size_,
eps_);
}
};
class EmbEltwiseLayernormPluginDynamicCreator
......
......@@ -31,31 +31,61 @@ namespace plugin {
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
int SkipLayerNormPluginDynamic::initialize() TRT_NOEXCEPT {
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_,
bias_.data(),
bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_,
scale_.data(),
scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
template <typename T>
void SkipLayerNormPluginDynamicImpl<T>::shareGPUData(
const SkipLayerNormPluginDynamicImplBase *anthor) {
auto *ptr = dynamic_cast<const SkipLayerNormPluginDynamicImpl<T> *>(anthor);
if (!ptr->is_initialized_) {
return;
}
scale_gpu_ = ptr->scale_gpu_;
bias_gpu_ = ptr->bias_gpu_;
}
template <typename T>
int SkipLayerNormPluginDynamicImpl<T>::initialize() {
if (is_initialized_) {
return 0;
}
if (bias_) {
cudaMalloc(&bias_gpu_, sizeof(T) * bias_size_);
cudaMemcpy(
bias_gpu_, bias_, bias_size_ * sizeof(T), cudaMemcpyHostToDevice);
}
if (scale_) {
cudaMalloc(&scale_gpu_, sizeof(T) * scale_size_);
cudaMemcpy(
scale_gpu_, scale_, scale_size_ * sizeof(T), cudaMemcpyHostToDevice);
}
is_initialized_ = true;
return 0;
}
void SkipLayerNormPluginDynamic::terminate() TRT_NOEXCEPT {
template <typename T>
void SkipLayerNormPluginDynamicImpl<T>::terminate() {
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
int SkipLayerNormPluginDynamic::initialize() TRT_NOEXCEPT {
impl_->initialize();
return 0;
}
void SkipLayerNormPluginDynamic::terminate() TRT_NOEXCEPT {
impl_->terminate();
}
nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
int output_index,
const nvinfer1::DimsExprs *inputs,
......@@ -73,6 +103,12 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination(
in_out,
platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_EQ(nb_outputs,
1,
platform::errors::InvalidArgument(
"The SkipLayerNorm's output should be one"
"but it's (%d) outputs.",
nb_outputs));
PADDLE_ENFORCE_LT(
pos,
......@@ -82,30 +118,27 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination(
pos,
nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
const nvinfer1::PluginTensorDesc &desc = in_out[pos];
if (pos == 0) {
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
return (desc.type == nvinfer1::DataType::kHALF) &&
(desc.format == nvinfer1::TensorFormat::kLINEAR);
#else
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
return (desc.type == nvinfer1::DataType::kFLOAT) &&
(desc.format == nvinfer1::TensorFormat::kLINEAR);
#endif
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
return (desc.type == nvinfer1::DataType::kFLOAT) &&
(desc.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
if (pos == 1) {
return in.type == prev.type && in.format == prev.format;
return desc.type == prev.type && desc.format == prev.format;
}
// output
return in.type == prev.type && in.format == prev.format;
return desc.type == prev.type && desc.format == prev.format;
}
nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
......@@ -115,7 +148,7 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The SkipLayerNorm Plugin only has one input, so the "
"The SkipLayerNorm Plugin only has one output, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
......@@ -126,7 +159,8 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
return input_types[0];
}
int SkipLayerNormPluginDynamic::enqueue(
template <typename T>
int SkipLayerNormPluginDynamicImpl<T>::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs,
......@@ -138,51 +172,45 @@ int SkipLayerNormPluginDynamic::enqueue(
int hidden = input_dims.d[2];
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp32";
const float *input1 = static_cast<const float *>(inputs[0]);
const float *input2 = static_cast<const float *>(inputs[1]);
float *output = static_cast<float *>(outputs[0]);
operators::math::SkipLayerNormFunctor<float> skip_layer_norm_func;
skip_layer_norm_func(num,
hidden,
input1,
input2,
scale_gpu_,
bias_gpu_,
output,
eps_,
stream);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp16";
const half *input1 = static_cast<const half *>(inputs[0]);
const half *input2 = static_cast<const half *>(inputs[1]);
half *output = static_cast<half *>(outputs[0]);
operators::math::SkipLayerNormFunctor<half> skip_layer_norm_func;
skip_layer_norm_func(num,
hidden,
input1,
input2,
scale_gpu_,
bias_gpu_,
output,
static_cast<half>(eps_),
stream);
#else
PADDLE_THROW(platform::errors::Fatal(
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.SetTRTDynamicShapeInfo(min_input_shape, "
"max_input_shape, opt_input_shape, true"));
#endif
if (std::is_same<T, float>::value) {
PADDLE_ENFORCE_EQ(input_type == nvinfer1::DataType::kFLOAT,
true,
platform::errors::InvalidArgument(
"The SkipLayernorm Plugin only support fp32 input."));
} else if (std::is_same<T, half>::value) {
PADDLE_ENFORCE_EQ(input_type == nvinfer1::DataType::kHALF,
true,
platform::errors::InvalidArgument(
"The SkipLayernorm Plugin only support fp16 input."));
} else {
PADDLE_THROW(platform::errors::Fatal(
"The SkipLayerNorm TRT Plugin's input type should be float or half."));
"Unsupport data type, the out type of SkipLayernorm should be "
"float or half."));
}
auto *output_d = reinterpret_cast<T *>(outputs[0]);
const T *input1 = reinterpret_cast<const T *>(inputs[0]);
const T *input2 = reinterpret_cast<const T *>(inputs[1]);
auto *output = reinterpret_cast<T *>(outputs[0]);
operators::math::SkipLayerNormFunctor<T> skip_layer_norm_func;
skip_layer_norm_func(
num, hidden, input1, input2, scale_gpu_, bias_gpu_, output, eps_, stream);
return cudaGetLastError() != cudaSuccess;
}
int SkipLayerNormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs,
void *const *outputs,
void *workspace,
cudaStream_t stream) TRT_NOEXCEPT {
impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
......
......@@ -15,11 +15,13 @@
#pragma once
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace inference {
......@@ -27,36 +29,155 @@ namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class SkipLayerNormPluginDynamicImplBase {
public:
SkipLayerNormPluginDynamicImplBase() {}
virtual ~SkipLayerNormPluginDynamicImplBase() {}
virtual int initialize() = 0;
virtual void terminate() = 0;
virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) = 0;
virtual void shareGPUData(
const SkipLayerNormPluginDynamicImplBase* anthor) = 0;
};
template <typename T>
class SkipLayerNormPluginDynamicImpl
: public SkipLayerNormPluginDynamicImplBase {
public:
explicit SkipLayerNormPluginDynamicImpl(
T* bias, T* scale, int bias_size, int scale_size, const float eps)
: bias_(bias),
scale_(scale),
bias_size_(bias_size),
scale_size_(scale_size),
eps_(eps) {}
~SkipLayerNormPluginDynamicImpl() {}
int initialize();
void terminate();
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT;
void shareGPUData(const SkipLayerNormPluginDynamicImplBase* anthor);
private:
T* bias_{nullptr};
T* scale_{nullptr};
// data on devices
T* bias_gpu_{nullptr};
T* scale_gpu_{nullptr};
int bias_size_;
int scale_size_;
float eps_;
bool is_initialized_{false};
};
class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
public:
explicit SkipLayerNormPluginDynamic(const float* bias,
const float* scale,
explicit SkipLayerNormPluginDynamic(void* bias,
void* scale,
int bias_size,
int scale_size,
const float eps,
float eps,
bool with_fp16)
: bias_size_(bias_size), scale_size_(scale_size), eps_(eps) {
: bias_(bias),
scale_(scale),
bias_size_(bias_size),
scale_size_(scale_size),
eps_(eps),
own_host_buff_(false) {
with_fp16_ = with_fp16;
bias_.resize(bias_size);
scale_.resize(scale_size);
std::copy(bias, bias + bias_size, bias_.data());
std::copy(scale, scale + scale_size, scale_.data());
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp16";
instantiateImpl<half>();
#else
PADDLE_THROW(platform::errors::Fatal(
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.EnableTensorRtEngine(1 << 30, 1, 5, "
"AnalysisConfig::Precision::kFloat32, false, false) "));
#endif
} else {
VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp32";
instantiateImpl<float>();
}
}
SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &bias_);
DeserializeValue(&serial_data, &serial_length, &scale_);
SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length)
: own_host_buff_(true) {
// the first var is with_fp16, we will use it.
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
if (with_fp16_) {
if (bias_size_) {
bias_ = new half[bias_size_];
memcpy(bias_, serial_data, sizeof(half) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(half);
serial_length -= bias_size_ * sizeof(half);
if (scale_size_) {
scale_ = new half[scale_size_];
memcpy(scale_, serial_data, sizeof(half) * scale_size_);
}
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(half);
serial_length -= scale_size_ * sizeof(half);
} else {
if (bias_size_) {
bias_ = new float[bias_size_];
memcpy(bias_, serial_data, sizeof(float) * bias_size_);
}
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
if (scale_size_) {
scale_ = new float[scale_size_];
memcpy(scale_, serial_data, sizeof(float) * scale_size_);
}
reinterpret_cast<char const*&>(serial_data) +=
scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
}
if (with_fp16_) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
instantiateImpl<half>();
#else
PADDLE_THROW(platform::errors::Fatal(
"The Ernie(Bert) tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.EnableTensorRtEngine(1 << 30, 1, 5, "
"AnalysisConfig::Precision::kFloat32, false, false) "));
#endif
} else {
instantiateImpl<float>();
}
}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, with_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
bias_, scale_, bias_size_, scale_size_, eps_, with_fp16_);
ptr->shareGPUData(this);
return ptr;
}
......@@ -65,20 +186,48 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override {
size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(bias_size_) + SerializedSize(scale_size_) +
SerializedSize(eps_) + SerializedSize(with_fp16_);
return ser_size;
size_t sum_num = 0;
sum_num += SerializedSize(with_fp16_);
if (with_fp16_) {
sum_num += (bias_size_ + scale_size_) * sizeof(half);
} else {
sum_num += (bias_size_ + scale_size_) * sizeof(float);
}
sum_num += SerializedSize(bias_size_);
sum_num += SerializedSize(scale_size_);
sum_num += SerializedSize(eps_);
return sum_num;
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, bias_);
SerializeValue(&buffer, scale_);
// the first var is for with_fp16, we will use it later;
SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, with_fp16_);
if (with_fp16_) {
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<half*>(bias_)[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<half*>(scale_)[i]);
}
} else {
for (int i = 0; i < bias_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<float*>(bias_)[i]);
}
for (int i = 0; i < scale_size_; ++i) {
SerializeValue(&buffer, reinterpret_cast<float*>(scale_)[i]);
}
}
}
nvinfer1::DimsExprs getOutputDimensions(int output_index,
......@@ -115,20 +264,43 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
int nb_inputs) const
TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
void terminate() TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override {
if (own_host_buff_) {
if (with_fp16_) {
delete[] reinterpret_cast<half*>(bias_);
delete[] reinterpret_cast<half*>(scale_);
} else {
delete[] reinterpret_cast<float*>(bias_);
delete[] reinterpret_cast<float*>(scale_);
}
}
delete impl_;
delete this;
}
private:
std::vector<float> bias_;
std::vector<float> scale_;
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
void* bias_{nullptr};
void* scale_{nullptr};
int bias_size_;
int scale_size_;
float eps_;
bool own_host_buff_{false};
SkipLayerNormPluginDynamicImplBase* impl_{nullptr};
void shareGPUData(const SkipLayerNormPluginDynamic* anthor) {
impl_->shareGPUData(anthor->impl_);
}
template <typename U>
void instantiateImpl() {
impl_ = new SkipLayerNormPluginDynamicImpl<U>(reinterpret_cast<U*>(bias_),
reinterpret_cast<U*>(scale_),
bias_size_,
scale_size_,
eps_);
}
};
class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator {
......@@ -154,8 +326,7 @@ class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator {
const void* serial_data,
size_t serial_length)
TRT_NOEXCEPT override {
auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length);
return plugin;
return new SkipLayerNormPluginDynamic(serial_data, serial_length);
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
......@@ -173,6 +344,7 @@ class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator {
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator);
#endif
} // namespace plugin
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <cuda_fp16.h>
#include <cstring>
#include <string>
#include <type_traits>
......@@ -46,10 +47,11 @@ template <typename T, class Enable = void>
struct Serializer {};
template <typename T>
struct Serializer<T,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
struct Serializer<
T,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value || std::is_pod<T>::value ||
std::is_same<T, half>::value>::type> {
static size_t SerializedSize(T const& value) { return sizeof(T); }
static void Serialize(void** buffer, T const& value) {
......@@ -86,10 +88,11 @@ struct Serializer<const char*> {
};
template <typename T>
struct Serializer<std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
struct Serializer<
std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value || std::is_pod<T>::value ||
std::is_same<T, half>::value>::type> {
static size_t SerializedSize(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T);
}
......
......@@ -98,8 +98,9 @@ static void trt_ernie(bool with_fp16, std::vector<float> result) {
std::string model_dir = FLAGS_infer_model;
// Delete serialization cache to perform serialization first rather than
// deserialization.
std::string opt_cache_dir = FLAGS_infer_model + "/_opt_cache";
std::string opt_cache_dir = FLAGS_infer_model + "/opt_cache";
delete_cache_files(opt_cache_dir);
config.SetOptimCacheDir(opt_cache_dir);
SetConfig(&config, model_dir, true /* use_gpu */);
......
......@@ -15,11 +15,14 @@
#include <paddle/fluid/platform/device_context.h>
#include <algorithm>
#include <type_traits>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
......@@ -99,19 +102,37 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
auto *output_d = out->mutable_data<T>(context.GetPlace());
float eps = context.Attr<float>("epsilon");
int shared_bytes = input_num * sizeof(int64_t);
math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch,
seq_len,
hidden,
in_ids_d,
scale_d,
bias_d,
in_embs_d,
output_d,
eps,
input_num,
device_ctx.stream());
if (std::is_same<T, paddle::platform::float16>::value) {
const half *scale_new = reinterpret_cast<const half *>(scale_d);
const half *bias_new = reinterpret_cast<const half *>(bias_d);
half *output_new = reinterpret_cast<half *>(output_d);
math::EmbEltwiseLayerNormFunctor<half> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch,
seq_len,
hidden,
in_ids_d,
scale_new,
bias_new,
in_embs_d,
output_new,
eps,
input_num,
device_ctx.stream());
} else {
math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
emb_eltwise_layernorm_func(batch,
seq_len,
hidden,
in_ids_d,
scale_d,
bias_d,
in_embs_d,
output_d,
eps,
input_num,
device_ctx.stream());
}
}
};
......@@ -119,6 +140,14 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
REGISTER_OP_CUDA_KERNEL(
fused_embedding_eltwise_layernorm,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext, float>,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext,
paddle::platform::float16>);
#else
REGISTER_OP_CUDA_KERNEL(
fused_embedding_eltwise_layernorm,
ops::EmbeddingEltWiseLayerNormKernel<phi::GPUContext, float>);
#endif
......@@ -15,6 +15,7 @@
#include <paddle/fluid/platform/device_context.h>
#include <algorithm>
#include <type_traits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
......@@ -53,15 +54,34 @@ class SkipLayerNormKernel : public framework::OpKernel<T> {
auto &device_ctx = context.template device_context<DeviceContext>();
operators::math::SkipLayerNormFunctor<T> skip_layer_norm_func;
skip_layer_norm_func(num,
hidden,
X_d,
Y_d,
scale_d,
bias_d,
output_d,
epsilon,
device_ctx.stream());
if (std::is_same<T, paddle::platform::float16>::value) {
const half *X_new = reinterpret_cast<const half *>(X_d);
const half *Y_new = reinterpret_cast<const half *>(Y_d);
const half *scale_new = reinterpret_cast<const half *>(scale_d);
const half *bias_new = reinterpret_cast<const half *>(bias_d);
half *output_new = reinterpret_cast<half *>(output_d);
operators::math::SkipLayerNormFunctor<half> skip_layer_norm_func;
skip_layer_norm_func(num,
hidden,
X_new,
Y_new,
scale_new,
bias_new,
output_new,
epsilon,
device_ctx.stream());
} else {
operators::math::SkipLayerNormFunctor<T> skip_layer_norm_func;
skip_layer_norm_func(num,
hidden,
X_d,
Y_d,
scale_d,
bias_d,
output_d,
epsilon,
device_ctx.stream());
}
}
};
......@@ -69,5 +89,13 @@ class SkipLayerNormKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
REGISTER_OP_CUDA_KERNEL(
skip_layernorm,
ops::SkipLayerNormKernel<phi::GPUContext, float>,
ops::SkipLayerNormKernel<phi::GPUContext, paddle::platform::float16>);
#else
REGISTER_OP_CUDA_KERNEL(skip_layernorm,
ops::SkipLayerNormKernel<phi::GPUContext, float>);
#endif
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <type_traits>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
......@@ -42,8 +43,8 @@ __device__ inline void LayerNormSmall(T val,
const phi::funcs::kvp<T> &thread_data,
const int ld,
const int idx,
const float *bias,
const float *scale,
const T *bias,
const T *scale,
T *output,
T eps) {
using BlockReduce = cub::BlockReduce<phi::funcs::kvp<T>, TPB>;
......@@ -70,8 +71,8 @@ template <typename T, int TPB>
__device__ inline void LayerNorm(const phi::funcs::kvp<T> &thread_data,
const int ld,
const int offset,
const float *bias,
const float *scale,
const T *bias,
const T *scale,
T *output,
T eps) {
using BlockReduce = cub::BlockReduce<phi::funcs::kvp<T>, TPB>;
......@@ -100,8 +101,8 @@ template <typename T, typename T2, int TPB>
__device__ inline void LayerNorm2(const phi::funcs::kvp<T> &thread_data,
const int ld,
const int offset,
const float2 *bias,
const float2 *scale,
const T2 *bias,
const T2 *scale,
T2 *output,
T eps) {
using BlockReduce = cub::BlockReduce<phi::funcs::kvp<T>, TPB>;
......@@ -120,8 +121,8 @@ __device__ inline void LayerNorm2(const phi::funcs::kvp<T> &thread_data,
for (int i = threadIdx.x; i < ld; i += TPB) {
const int idx = offset + i;
T2 val = output[idx];
const float2 g = scale[i];
const float2 b = bias[i];
const T2 g = scale[i];
const T2 b = bias[i];
val.x = T(g.x) * (val.x - mu) * rsigma + T(b.x);
val.y = T(g.y) * (val.y - mu) * rsigma + T(b.y);
output[idx] = val;
......@@ -131,11 +132,11 @@ __device__ inline void LayerNorm2(const phi::funcs::kvp<T> &thread_data,
template <typename T, unsigned TPB>
__global__ void EmbEltwiseLayernormKernel(int hidden,
const int64_t *ids,
const float *scale,
const float *bias,
const T *scale,
const T *bias,
const int64_t *embs,
T *output,
float eps,
T eps,
int input_num) {
cub::Sum pair_sum;
// blockIdx.x: position in the sequence
......@@ -179,11 +180,11 @@ __global__ void EmbEltwiseLayernormKernel(int hidden,
template <>
__global__ void EmbEltwiseLayernormKernel<half, 256>(int hidden,
const int64_t *ids,
const float *scale,
const float *bias,
const half *scale,
const half *bias,
const int64_t *embs,
half *output,
float eps,
half eps,
int input_num) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
cub::Sum pair_sum;
......@@ -231,8 +232,8 @@ void EmbEltwiseLayerNormFunctor<T>::operator()(int batch,
int seq_len,
int hidden,
const int64_t *ids,
const float *scale,
const float *bias,
const T *scale,
const T *bias,
const int64_t *embs,
T *output,
float eps,
......@@ -720,9 +721,9 @@ __global__ void SkipLayerNormSmallKernel(int num,
const T *input1,
const T *input2,
T *output,
const float *scale,
const float *bias,
float eps) {
const T *scale,
const T *bias,
T eps) {
const T rld = T(1) / T(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
......@@ -747,9 +748,9 @@ __global__ void SkipLayerNormSmallKernel<half, 32>(int num,
const half *input1,
const half *input2,
half *output,
const float *scale,
const float *bias,
float eps) {
const half *scale,
const half *bias,
half eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
......@@ -774,9 +775,9 @@ __global__ void SkipLayerNormSmallKernel<half, 128>(int num,
const half *input1,
const half *input2,
half *output,
const float *scale,
const float *bias,
float eps) {
const half *scale,
const half *bias,
half eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
......@@ -801,9 +802,9 @@ __global__ void SkipLayerNormSmallKernel<half, 384>(int num,
const half *input1,
const half *input2,
half *output,
const float *scale,
const float *bias,
float eps) {
const half *scale,
const half *bias,
half eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
......@@ -829,9 +830,9 @@ __global__ void SkipLayerNormKernel(int num,
const T *input1,
const T *input2,
T *output,
const float *scale,
const float *bias,
float eps) {
const T *scale,
const T *bias,
T eps) {
const T rld = T(1) / T(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
......@@ -856,9 +857,9 @@ __global__ void SkipLayerNormKernel<half, 256>(int num,
const half *input1,
const half *input2,
half *output,
const float *scale,
const float *bias,
float eps) {
const half *scale,
const half *bias,
half eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
......@@ -884,8 +885,8 @@ __global__ void SkipLayerNormKernel2(int num,
const T2 *input1,
const T2 *input2,
T2 *output,
const float2 *scale,
const float2 *bias,
const T2 *scale,
const T2 *bias,
float eps) {
const T rld = T(0.5f / hidden); // because hidden is hidden/2
const int offset = blockIdx.x * hidden;
......@@ -912,8 +913,8 @@ __global__ void SkipLayerNormKernel2<half, half2, 256>(int num,
const half2 *input1,
const half2 *input2,
half2 *output,
const float2 *scale,
const float2 *bias,
const half2 *scale,
const half2 *bias,
float eps) {
// operator "+" of half only suppotted after cuda version 10.0
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000
......@@ -942,10 +943,10 @@ void SkipLayerNormFunctor<T>::operator()(const int num,
const int hidden,
const T *input1,
const T *input2,
const float *scale,
const float *bias,
const T *scale,
const T *bias,
T *output,
T eps,
float eps,
gpuStream_t stream) {
int block = num / hidden;
if (hidden <= 32) {
......@@ -984,8 +985,8 @@ void SkipLayerNormFunctor<T>::operator()(const int num,
reinterpret_cast<const __half2 *>(input1),
reinterpret_cast<const __half2 *>(input2),
reinterpret_cast<__half2 *>(output),
reinterpret_cast<const float2 *>(scale),
reinterpret_cast<const float2 *>(bias),
reinterpret_cast<const __half2 *>(scale),
reinterpret_cast<const __half2 *>(bias),
eps);
#endif
} else {
......
......@@ -68,8 +68,8 @@ class EmbEltwiseLayerNormFunctor {
int seq_len,
int hidden,
const int64_t *ids,
const float *scale,
const float *bias,
const T *scale,
const T *bias,
const int64_t *embs,
T *output,
float eps,
......@@ -125,10 +125,10 @@ class SkipLayerNormFunctor {
const int hidden,
const T *input1,
const T *input2,
const float *scale,
const float *bias,
const T *scale,
const T *bias,
T *output,
T eps,
float eps,
gpuStream_t stream);
};
#endif
......
......@@ -562,6 +562,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
runtime_batch = t_shape[0];
VLOG(1) << "trt input [" << x << "] dtype is " << t.dtype();
auto indata_type = inference::tensorrt::PhiType2NvType(t.dtype());
auto intrt_index = engine->engine()->getBindingIndex(x.c_str());
auto intrt_type = engine->engine()->getBindingDataType(intrt_index);
......@@ -570,6 +571,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
platform::errors::InvalidArgument(
"The TRT Engine OP's input type should equal "
"to the input data type"));
auto type = framework::TransToProtoVarType(t.dtype());
if (type == framework::proto::VarType::FP32) {
buffers[bind_index] = static_cast<void *>(t.data<float>());
......