diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 5676c3facec794923b58eef6bb1243a0960f4737..3797a561a4912c5a068f33017ad04619339dd09d 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
 
     if (engine_->with_dynamic_shape()) {
-      auto use_fp16 = engine_->WithFp16();
       plugin::DynamicPluginTensorRT* plugin = nullptr;
-      if (use_fp16) {
-#ifdef SUPPORTS_CUDA_FP16
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<half>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#else
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#endif
-      } else {
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-      }
+      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+          input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
+          eps);
       layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
     } else {
       PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 03fd7a283b44c0744fdfb1a6ad50ecf28cc92511..9ca0e979bf6e74d1730c8e2ae1a26cbf7e0ea576 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -200,9 +200,23 @@ class TensorRTEngine {
   void Deserialize(const std::string& engine_serialized_data) {
     freshDeviceId();
     infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-    infer_engine_.reset(runtime->deserializeCudaEngine(
-        engine_serialized_data.c_str(), engine_serialized_data.size(),
-        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    if (with_dynamic_shape_) {
+#if IS_TRT_VERSION_GE(6000)
+      infer_engine_.reset(runtime->deserializeCudaEngine(
+          engine_serialized_data.c_str(), engine_serialized_data.size(),
+          nullptr));
+#else
+
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "To enable dynamic shape support, the TensorRT version should be "
+          "greater than 6.0.0"));
+
+#endif
+    } else {
+      infer_engine_.reset(runtime->deserializeCudaEngine(
+          engine_serialized_data.c_str(), engine_serialized_data.size(),
+          &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    }
     PADDLE_ENFORCE(infer_engine_ != nullptr,
                    "build cuda engine failed when deserialize engine info.!");
   }
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 037dabf5d5888aecdaf781de474c35098be144c1..55a57caf9a0d6eb44399ceb8064b613afb955d47 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
       dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
+static nvinfer1::IPluginRegistry* getPluginRegistry() {
+  return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
+}
 
 // A logger for create TensorRT infer builder.
 class NaiveLogger : public nvinfer1::ILogger {
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 575dfa68e6ec6c5c972f3246a291c4bcf00bb781..e7f9381e97137d77d27b54cac910bfee9f629464 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -33,53 +33,29 @@ namespace plugin {
 
 template <typename T>
 int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
-  int nb_emb = embs_.size();
-  std::vector<T*> ptr_vector(nb_emb);
-  std::vector<std::vector<half>> emb_fp16(nb_emb);
-
-  if (sizeof(T) == sizeof(float)) {
-    // FP32
-    for (int i = 0; i < nb_emb; ++i) {
-      ptr_vector[i] = embs_[i];
-    }
-  } else {
-    // FP16
-    for (int i = 0; i < nb_emb; ++i) {
-      auto emb_size = emb_sizes_[i];
-      auto &tmp = emb_fp16[i];
-      tmp.resize(emb_size);
-
-      for (int j = 0; j < emb_size; ++j) {
-        tmp[j] = static_cast<half>(embs_[i][j]);
-      }
-      ptr_vector[i] = tmp.data();
-    }
-  }
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
-    cudaMalloc(&embs_gpu_[i], sizeof(T) * emb_sizes_[i]);
-    cudaMemcpy(embs_gpu_[i], ptr_vector[i], emb_sizes_[i] * sizeof(T),
-               cudaMemcpyHostToDevice);
+    if (embs_[i]) {
+      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
+      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+                 cudaMemcpyHostToDevice);
+    }
   }
 
-  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
-  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
-
-  return 0;
-}
+  if (bias_) {
+    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
+    cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }
+  if (scale_) {
+    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
+    cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }
 
-template <typename T>
-size_t EmbEltwiseLayernormPluginDynamic<T>::getSerializationSize() const {
   return 0;
 }
 
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::serialize(void *buffer) const {}
-
 template <typename T>
 nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index d0b5a4a5d6a085f22777652619540ef8b3d5f54c..8ac611cd7c62fddfd4f01d7705b841abc28501d3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         hidden_size_(hidden_size),
         eps_(eps) {}
 
-  EmbEltwiseLayernormPluginDynamic(void const* serialData,
-                                   size_t serialLength) {}
+  EmbEltwiseLayernormPluginDynamic(void const* serial_data,
+                                   size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
+
+    embs_gpu_.resize(emb_sizes_.size());
+    embs_.resize(emb_sizes_.size());
+    for (size_t i = 0; i < emb_sizes_.size(); i++) {
+      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
+      cudaMemcpy(embs_gpu_[i],
serial_data, emb_sizes_[i] * sizeof(float), + cudaMemcpyHostToDevice); + reinterpret_cast(serial_data) += + emb_sizes_[i] * sizeof(float); + serial_length -= emb_sizes_[i] * sizeof(float); + embs_[i] = nullptr; + } + DeserializeValue(&serial_data, &serial_length, &bias_size_); + DeserializeValue(&serial_data, &serial_length, &scale_size_); + + cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); + cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float), + cudaMemcpyHostToDevice); + bias_ = nullptr; + reinterpret_cast(serial_data) += bias_size_ * sizeof(float); + serial_length -= bias_size_ * sizeof(float); + + cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); + cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float), + cudaMemcpyHostToDevice); + scale_ = nullptr; + reinterpret_cast(serial_data) += scale_size_ * sizeof(float); + serial_length -= scale_size_ * sizeof(float); + + DeserializeValue(&serial_data, &serial_length, &hidden_size_); + DeserializeValue(&serial_data, &serial_length, &eps_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { return new EmbEltwiseLayernormPluginDynamic( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, @@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + int sum_num = 0; + sum_num += SerializedSize(emb_sizes_); + + for (size_t i = 0; i < emb_sizes_.size(); i++) { + sum_num += emb_sizes_[i] * sizeof(float); + } + + sum_num += SerializedSize(bias_size_); + sum_num += SerializedSize(scale_size_); + + sum_num += (bias_size_ + scale_size_) * sizeof(float); + sum_num += SerializedSize(hidden_size_); + sum_num += SerializedSize(eps_); + // sum_num += SerializedSize(with_fp16_); + + return sum_num; + } + + void serialize(void* buffer) const override { + // SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, emb_sizes_); + for (size_t i = 0; i < emb_sizes_.size(); i++) { + SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]); + } + SerializeValue(&buffer, bias_size_); + SerializeValue(&buffer, scale_size_); + SerializeCudaPointer(&buffer, bias_gpu_, bias_size_); + SerializeCudaPointer(&buffer, scale_gpu_, scale_size_); + SerializeValue(&buffer, hidden_size_); + SerializeValue(&buffer, eps_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* 
workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } @@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { // data on devices float* bias_gpu_; float* scale_gpu_; - std::vector embs_gpu_; + std::vector embs_gpu_; std::vector emb_sizes_; int bias_size_; @@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { int hidden_size_; float eps_; }; + +class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { + public: + EmbEltwiseLayernormPluginV2Creator() {} + const char* getPluginName() const override { + return "fused_embedding_eltwise_layernorm_plugin"; + } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new EmbEltwiseLayernormPluginDynamic(serial_data, + serial_length); + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); + #endif } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index 6a718d47b1542b3cce97f6ff1f8744b4d58a8102..854ca515db5f5904abb47378fcbec1ea3b391fb9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) -size_t GeluPluginDynamic::getSerializationSize() const { return 0; } - -void GeluPluginDynamic::serialize(void* buffer) const {} nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index e4dcce3a856df027c9e360dbcce295d6afa52642..02219bc27a763569484db0ba06259abd001e514d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT { // It was used for tensorrt deserialization. // It should not be called by users. 
- GeluPlugin(void const* serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); + GeluPlugin(void const* serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); } ~GeluPlugin() {} @@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT { bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; - int enqueue(int batchSize, const void* const* inputs, void** outputs, + int nb_input_dims) override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; protected: @@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT { class GeluPluginDynamic : public DynamicPluginTensorRT { public: GeluPluginDynamic() {} - GeluPluginDynamic(void const* serialData, size_t serialLength) {} + GeluPluginDynamic(void const* serial_data, size_t serial_length) {} ~GeluPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* clone() const override { @@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override { return 0; } - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { return 0; } + void serialize(void* buffer) const override {} nvinfer1::DimsExprs getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } }; + +class GeluPluginV2Creator : public nvinfer1::IPluginCreator { + public: + GeluPluginV2Creator() {} + const char* getPluginName() const override { return "gelu_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new 
GeluPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index fe3ea180593b914b1fec948644723ec0a535b4d7..5014a7db981e932b0d606ea15761fdc460550a11 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -141,10 +141,6 @@ inline void TransposeQKV(const int batch, const int seq_len, int QkvToContextPluginDynamic::initialize() { return 0; } -size_t QkvToContextPluginDynamic::getSerializationSize() const { return 0; } - -void QkvToContextPluginDynamic::serialize(void *buffer) const {} - nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index 18c5d27fc45cfe9394416cededc2b852b5f5651a..72a2732ae2021a19b24dd9bfe5bf3a965c937712 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -1,3 +1,17 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { scale_(scale), ban_fp16_(ban_fp16) {} - QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {} + QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &hidden_); + DeserializeValue(&serial_data, &serial_length, &head_number_); + DeserializeValue(&serial_data, &serial_length, &head_size_); + DeserializeValue(&serial_data, &serial_length, &scale_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_, scale_, ban_fp16_); @@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + return SerializedSize(hidden_) + SerializedSize(head_number_) + + SerializedSize(head_size_) + SerializedSize(scale_) + + SerializedSize(ban_fp16_); + } + void serialize(void* buffer) const override { + SerializeValue(&buffer, hidden_); + SerializeValue(&buffer, head_number_); + SerializeValue(&buffer, head_size_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, ban_fp16_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } @@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } @@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; bool ban_fp16_; }; + +class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { + public: + QkvToContextPluginV2Creator() {} + const char* getPluginName() const override { return "qkv_to_context_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length); + return plugin; + 
} + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 4fb9bbc1ff8627067b910da1a588a5c25786dc25..c51dae5e00c12ee632f29c59c6acd9a36e58b945 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -32,18 +32,14 @@ namespace plugin { int SkipLayerNormPluginDynamic::initialize() { cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); - cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float), + cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float), cudaMemcpyHostToDevice); cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); - cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float), + cudaMemcpy(scale_gpu_, scale_.data(), scale_size_ * sizeof(float), cudaMemcpyHostToDevice); return 0; } -size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; } - -void SkipLayerNormPluginDynamic::serialize(void *buffer) const {} - nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index ede90d7db169eab5fe2c14091ba74b31a8d546df..8fe1edc4bf0321b054322a27f0c16819bc023ed8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -29,61 +29,84 @@ namespace plugin { #if IS_TRT_VERSION_GE(6000) class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { public: - explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size, - int scale_size, const float eps, - bool ban_fp16) - : bias_(bias), - scale_(scale), - bias_size_(bias_size), + explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale, + int bias_size, int scale_size, + const float eps, bool ban_fp16) + : bias_size_(bias_size), scale_size_(scale_size), eps_(eps), - ban_fp16_(ban_fp16) {} - SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {} + ban_fp16_(ban_fp16) { + bias_.resize(bias_size); + scale_.resize(scale_size); + std::copy(bias, bias + bias_size, bias_.data()); + std::copy(scale, scale + scale_size, scale_.data()); + } + SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &bias_); + DeserializeValue(&serial_data, &serial_length, &scale_); + DeserializeValue(&serial_data, &serial_length, &bias_size_); + DeserializeValue(&serial_data, &serial_length, &scale_size_); + DeserializeValue(&serial_data, &serial_length, &eps_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { - return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_, - scale_size_, eps_, ban_fp16_); + return new SkipLayerNormPluginDynamic( + 
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); } const char* getPluginType() const override { return "skip_layernorm_plugin"; } int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(bias_size_) + SerializedSize(scale_size_) + + SerializedSize(eps_) + SerializedSize(eps_); + return ser_size; + } + void serialize(void* buffer) const override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, bias_size_); + SerializeValue(&buffer, scale_size_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, ban_fp16_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } private: - float* bias_; - float* scale_; + std::vector bias_; + std::vector scale_; float* bias_gpu_; float* scale_gpu_; @@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; bool ban_fp16_; }; + +class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { + public: + SkipLayerNormPluginV2Creator() {} + const char* getPluginName() const override { return "skip_layernorm_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_; + std::vector plugin_attributes_; +}; 
+REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 0fc0f7bf35dec379cc22ade3da2bb314744d0929..33eec618ff62bd04e368d63839b7ee669f7f9519 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -175,11 +175,29 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { void serializeBase(void*& buffer) const; // NOLINT private: - std::string name_space_{"paddle_trt"}; - std::string plugin_base_{"plugin_dynamic"}; + std::string name_space_; + std::string plugin_base_; }; #endif +template +class TrtPluginRegistrarV2 { + public: + TrtPluginRegistrarV2() { + static auto func_ptr = getPluginRegistry(); + if (func_ptr != nullptr) { + func_ptr->registerCreator(creator, ""); + } + } + + private: + T creator; +}; + +#define REGISTER_TRT_PLUGIN_V2(name) \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ + plugin_registrar_##name {} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 1cae4ccae4cc593785d9b3b0e87523e740eef4ff..18037179c7b98952b6088361954e869ecedfb2c7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -128,6 +128,12 @@ inline void DeserializeValue(void const** buffer, size_t* buffer_size, return details::Serializer::Deserialize(buffer, buffer_size, value); } +template +inline void SerializeCudaPointer(void** buffer, T* value, int size) { + cudaMemcpy((*buffer), value, size * sizeof(T), cudaMemcpyDeviceToHost); + reinterpret_cast(*buffer) += size * sizeof(T); +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5fca1d0b6eddb0d37ef2082b0dc4f86d39219128..8274ca86dc4225946848778cbac689674b789843 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -420,6 +420,25 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) + + set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") + if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}) + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") + endif() + + inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) + + set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized/") + if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz") + endif() + + inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized) + endif() set(LITE_MODEL_INSTALL_DIR 
"${INFERENCE_DEMO_INSTALL_DIR}/lite") diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6526b87436557b7f0c5c6dc5d3b59f2d70323d84 --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +void run(const AnalysisConfig& config, std::vector* out_data) { + auto predictor = CreatePaddlePredictor(config); + auto input_names = predictor->GetInputNames(); + + int run_batch = 1; + const int run_seq_len = 128; + + std::vector tmp_input; + std::vector tmp_four_input; + tmp_input.reserve(run_batch * run_seq_len); + tmp_four_input.reserve(run_batch * run_seq_len); + + int64_t i0[run_seq_len] = { + 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, + 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, + 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; + int64_t i1[run_seq_len] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + // first input + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({run_batch, run_seq_len, 1}); + input_t->copy_from_cpu(i0); + + // second input + auto input_t2 = predictor->GetInputTensor(input_names[1]); + input_t2->Reshape({run_batch, run_seq_len, 1}); + input_t2->copy_from_cpu(i1); + + // third input. 
+ auto input_t3 = predictor->GetInputTensor(input_names[2]); + input_t3->Reshape({run_batch, run_seq_len, 1}); + input_t3->copy_from_cpu(i2); + + auto input_t4 = predictor->GetInputTensor(input_names[3]); + input_t4->Reshape({run_batch, run_seq_len, 1}); + input_t4->copy_from_cpu(i3); + + ASSERT_TRUE(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); +} + +void trt_ernie(bool with_fp16, std::vector result) { + AnalysisConfig config; + std::string model_dir = FLAGS_infer_model; + SetConfig(&config, model_dir, true /* use_gpu */); + + config.SwitchUseFeedFetchOps(false); + + int head_number = 12; + int batch = 1; + int min_seq_len = 1; + int max_seq_len = 128; + int opt_seq_len = 128; + + std::vector min_shape = {batch, min_seq_len, 1}; + std::vector max_shape = {batch, max_seq_len, 1}; + std::vector opt_shape = {batch, opt_seq_len, 1}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {"read_file_0.tmp_0", min_shape}, + {"read_file_0.tmp_1", min_shape}, + {"read_file_0.tmp_2", min_shape}, + {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + std::map> max_input_shape = { + {"read_file_0.tmp_0", max_shape}, + {"read_file_0.tmp_1", max_shape}, + {"read_file_0.tmp_2", max_shape}, + {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}}; + std::map> opt_input_shape = { + {"read_file_0.tmp_0", opt_shape}, + {"read_file_0.tmp_1", opt_shape}, + {"read_file_0.tmp_2", opt_shape}, + {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + + auto precision = AnalysisConfig::Precision::kFloat32; + if (with_fp16) { + precision = AnalysisConfig::Precision::kHalf; + } + config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false); + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + std::vector out_data; + run(config, &out_data); + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(result[i], out_data[i], 1e-6); + } +} + +TEST(AnalysisPredictor, no_fp16) { + std::vector result = {0.597841, 0.219972, 0.182187}; + trt_ernie(false, result); +} + +TEST(AnalysisPredictor, fp16) { +#ifdef SUPPORTS_CUDA_FP16 + std::vector result = {0.598336, 0.219558, 0.182106}; + trt_ernie(true, result); +#endif +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 5fbf0867ba2864c17c15c3368ec8cccdd6221a61..babe9977cd571f588f0bdc5a6723d4b05afab72b 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector result) { if (with_fp16) { precision = AnalysisConfig::Precision::kHalf; } - config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, true); + config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false); config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); std::vector out_data; diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 
5d47c066d6eac7bc7b6f1888cb688768cbdfda94..4004ad401ee7f509edb00fe1424bcf69665e17a3 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index f3c8e27944ca9b6419de87d752df3a83751039b1..c9c3a9456b736ee1afb2efbe9bf092e2ae298372 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -13,18 +13,62 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/tensorrt.h" +#include namespace paddle { namespace platform { namespace dynload { std::once_flag tensorrt_dso_flag; -void *tensorrt_dso_handle; +void* tensorrt_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +void* GetTensorRtHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer.dll"; +#else + std::string dso_name = "libnvinfer.so"; +#endif + +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 + + void* dso_handle = dlopen(dso_name.c_str(), dynload_flags); + + if (nullptr == dso_handle) { + auto error_msg = + "TensorRT dynamic library (%s) that Paddle depends on is not " + "configured correctly. (error code is %s)\n" + " Suggestions:\n" + " 1. Check if TensorRT " + "is installed correctly and its version is matched with paddlepaddle " + "you installed.\n" + " 2. Configure TensorRT dynamic library environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " + "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " + "impossible unless System Integrity Protection (SIP) is disabled.]"; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + std::cerr << string::Sprintf(error_msg, dso_name, errorno); + } + + return dso_handle; +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 4c7ba0f054cfc80702eb4fb4127d7008f6e49c02..34ad1e74588805b20543ba68869bf4e466b5911f 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -27,42 +27,32 @@ namespace paddle { namespace platform { namespace dynload { +void* GetTensorRtHandle(); + extern std::once_flag tensorrt_dso_flag; extern void* tensorrt_dso_handle; -#ifdef PADDLE_USE_DSO - -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{       \
-      using tensorrt_func = decltype(&::__name);                         \
-      std::call_once(tensorrt_dso_flag, []() {                           \
-        tensorrt_dso_handle =                                            \
-            paddle::platform::dynload::GetTensorRtDsoHandle();           \
-        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed");  \
-      });                                                                \
-      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);     \
-      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);             \
-      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);       \
-    }                                                                    \
-  };                                                                     \
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                            \
+  struct DynLoad__##__name {                                                  \
+    template <typename... Args>                                               \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
+      using tensorrt_func = decltype(&::__name);                              \
+      std::call_once(tensorrt_dso_flag, []() {                                \
+        tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
+      });                                                                     \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);          \
+      if (p_##__name == nullptr) {                                            \
+        return nullptr;                                                       \
+      }                                                                       \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
+    }                                                                         \
+  };                                                                          \
   extern DynLoad__##__name __name
 
-#else
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
-  struct DynLoad__##__name {                       \
-    template <typename... Args>                    \
-    tensorrtResult_t operator()(Args... args) {    \
-      return __name(args...);                      \
-    }                                              \
-  };                                               \
-  extern DynLoad__##__name __name
-#endif
-
 #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
   __macro(createInferBuilder_INTERNAL);     \
-  __macro(createInferRuntime_INTERNAL);
+  __macro(createInferRuntime_INTERNAL);     \
+  __macro(getPluginRegistry);
 
 TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
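
Note on the serialization scheme these plugins share: getSerializationSize() reports a byte count, serialize() appends each field into the buffer TensorRT provides, and the deserialization constructor (reached through the registered IPluginCreator::deserializePlugin) consumes the same fields in the same order, so both sides must stay in lockstep. The snippet below is a minimal standalone sketch of that append/consume round trip; the two helper templates are simplified stand-ins (POD-only, host memory only), not the actual SerializeValue/DeserializeValue/SerializeCudaPointer helpers from trt_plugin_utils.h, and the field names are illustrative.

#include <cassert>
#include <cstring>
#include <vector>

// Append a trivially-copyable value to the buffer and advance the cursor.
template <typename T>
void SerializeValue(char** cursor, const T& value) {
  std::memcpy(*cursor, &value, sizeof(T));
  *cursor += sizeof(T);
}

// Read a value back and advance the cursor; order must match serialization.
template <typename T>
void DeserializeValue(const char** cursor, T* value) {
  std::memcpy(value, *cursor, sizeof(T));
  *cursor += sizeof(T);
}

int main() {
  // Illustrative plugin fields (hypothetical values).
  int hidden = 768;
  float eps = 1e-5f;
  bool ban_fp16 = false;

  // "getSerializationSize": total bytes of all serialized fields.
  std::vector<char> buffer(sizeof(int) + sizeof(float) + sizeof(bool));

  // "serialize": append fields in a fixed order.
  char* w = buffer.data();
  SerializeValue(&w, hidden);
  SerializeValue(&w, eps);
  SerializeValue(&w, ban_fp16);

  // Deserialization constructor: consume fields in the same order.
  int hidden2 = 0;
  float eps2 = 0.f;
  bool ban2 = true;
  const char* r = buffer.data();
  DeserializeValue(&r, &hidden2);
  DeserializeValue(&r, &eps2);
  DeserializeValue(&r, &ban2);

  assert(hidden2 == hidden && eps2 == eps && ban2 == ban_fp16);
  return 0;
}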