From b717895f64a3710bb79aa1bd66ce7848c406941f Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Wed, 5 Aug 2020 10:36:32 +0800
Subject: [PATCH] Fix registering trt plugin (#25744)

* develop dynamic shape serilization

* add test param for gelu

* fix bugs

* delete redundant comments

* debug

* fix conflict. test=develop

* fix bug. test=develop

* add trt dynamic shape serialized support

* fix ernie serialized bug
test=develop

* fix codestyle
test=develop

* fix bug
test=develop

* fix bug.test=develop

* modify cmakelist test=develop

* fix bug
test=develop

* fix error message.  test=develop

* fix trt register plugin based on pr#25003

* add trt dynload

* fix deserialization bug of not finding plugin registration

* refine code style

* recover engine key in tensorrt_subgraph_pass

* for ci coverage

* add unittest for deserialization

Co-authored-by: haozech <chenhaoze94@gmail.com>
---
 .../tensorrt/convert/emb_eltwise_layernorm.cc |  22 +--
 paddle/fluid/inference/tensorrt/engine.h      |  20 ++-
 paddle/fluid/inference/tensorrt/helper.h      |   3 +
 .../plugin/emb_eltwise_layernorm_plugin.cu    |  54 ++-----
 .../plugin/emb_eltwise_layernorm_plugin.h     | 137 ++++++++++++++--
 .../tensorrt/plugin/gelu_op_plugin.cu         |   3 -
 .../tensorrt/plugin/gelu_op_plugin.h          |  78 +++++++---
 .../tensorrt/plugin/qkv_to_context_plugin.cu  |   4 -
 .../tensorrt/plugin/qkv_to_context_plugin.h   |  91 +++++++++--
 .../plugin/skip_layernorm_op_plugin.cu        |   8 +-
 .../plugin/skip_layernorm_op_plugin.h         | 110 ++++++++++---
 .../inference/tensorrt/plugin/trt_plugin.h    |  17 +-
 .../tensorrt/plugin/trt_plugin_utils.h        |   6 +
 .../fluid/inference/tests/api/CMakeLists.txt  |  19 +++
 ...rt_dynamic_shape_ernie_deserialize_test.cc | 146 ++++++++++++++++++
 .../tests/api/trt_dynamic_shape_ernie_test.cc |   2 +-
 .../operators/math/bert_encoder_functor.cu    |   1 +
 paddle/fluid/platform/dynload/tensorrt.cc     |  46 +++++-
 paddle/fluid/platform/dynload/tensorrt.h      |   8 +-
 19 files changed, 625 insertions(+), 150 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 0f2e71cd5b5..cdc0e415d46 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
 
     if (engine_->with_dynamic_shape()) {
-      auto use_fp16 = engine_->WithFp16();
       plugin::DynamicPluginTensorRT* plugin = nullptr;
-      if (use_fp16) {
-#ifdef SUPPORTS_CUDA_FP16
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<half>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#else
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#endif
-      } else {
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-      }
+      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+          input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
+          eps);
       layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
     } else {
       PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 0a4f1c94f90..fdd71b0d884 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -200,9 +200,23 @@ class TensorRTEngine {
   void Deserialize(const std::string& engine_serialized_data) {
     freshDeviceId();
     infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-    infer_engine_.reset(runtime->deserializeCudaEngine(
-        engine_serialized_data.c_str(), engine_serialized_data.size(),
-        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    if (with_dynamic_shape_) {
+#if IS_TRT_VERSION_GE(6000)
+      infer_engine_.reset(runtime->deserializeCudaEngine(
+          engine_serialized_data.c_str(), engine_serialized_data.size(),
+          nullptr));
+#else
+
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "To enable dynamic shape support, the TensorRT version should be "
+          "greater than 6.0.0"));
+
+#endif
+    } else {
+      infer_engine_.reset(runtime->deserializeCudaEngine(
+          engine_serialized_data.c_str(), engine_serialized_data.size(),
+          &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    }
     PADDLE_ENFORCE(infer_engine_ != nullptr,
                    "build cuda engine failed when deserialize engine info.!");
   }
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 037dabf5d58..55a57caf9a0 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
       dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
+static nvinfer1::IPluginRegistry* getPluginRegistry() {
+  return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
+}
 
 // A logger for create TensorRT infer builder.
 class NaiveLogger : public nvinfer1::ILogger {
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 575dfa68e6e..e7f9381e971 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -33,53 +33,29 @@ namespace plugin {
 
 template <typename T>
 int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
-  int nb_emb = embs_.size();
-  std::vector<void *> ptr_vector(nb_emb);
-  std::vector<std::vector<half>> emb_fp16(nb_emb);
-
-  if (sizeof(T) == sizeof(float)) {
-    // FP32
-    for (int i = 0; i < nb_emb; ++i) {
-      ptr_vector[i] = embs_[i];
-    }
-  } else {
-    // FP16
-    for (int i = 0; i < nb_emb; ++i) {
-      auto emb_size = emb_sizes_[i];
-      auto &tmp = emb_fp16[i];
-      tmp.resize(emb_size);
-
-      for (int j = 0; j < emb_size; ++j) {
-        tmp[j] = static_cast<half>(embs_[i][j]);
-      }
-      ptr_vector[i] = tmp.data();
-    }
-  }
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
-    cudaMalloc(&embs_gpu_[i], sizeof(T) * emb_sizes_[i]);
-    cudaMemcpy(embs_gpu_[i], ptr_vector[i], emb_sizes_[i] * sizeof(T),
-               cudaMemcpyHostToDevice);
+    if (embs_[i]) {
+      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
+      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+                 cudaMemcpyHostToDevice);
+    }
   }
 
-  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
-  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
-
-  return 0;
-}
+  if (bias_) {
+    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
+    cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }
+  if (scale_) {
+    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
+    cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }
 
-template <typename T>
-size_t EmbEltwiseLayernormPluginDynamic<T>::getSerializationSize() const {
   return 0;
 }
 
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::serialize(void *buffer) const {}
-
 template <typename T>
 nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index d0b5a4a5d6a..8ac611cd7c6 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         hidden_size_(hidden_size),
         eps_(eps) {}
 
-  EmbEltwiseLayernormPluginDynamic(void const* serialData,
-                                   size_t serialLength) {}
+  EmbEltwiseLayernormPluginDynamic(void const* serial_data,
+                                   size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
+
+    embs_gpu_.resize(emb_sizes_.size());
+    embs_.resize(emb_sizes_.size());
+    for (size_t i = 0; i < emb_sizes_.size(); i++) {
+      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
+      cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
+                 cudaMemcpyHostToDevice);
+      reinterpret_cast<char const*&>(serial_data) +=
+          emb_sizes_[i] * sizeof(float);
+      serial_length -= emb_sizes_[i] * sizeof(float);
+      embs_[i] = nullptr;
+    }
+    DeserializeValue(&serial_data, &serial_length, &bias_size_);
+    DeserializeValue(&serial_data, &serial_length, &scale_size_);
+
+    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
+    cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+    bias_ = nullptr;
+    reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
+    serial_length -= bias_size_ * sizeof(float);
+
+    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
+    cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+    scale_ = nullptr;
+    reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
+    serial_length -= scale_size_ * sizeof(float);
+
+    DeserializeValue(&serial_data, &serial_length, &hidden_size_);
+    DeserializeValue(&serial_data, &serial_length, &eps_);
+  }
+
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     return new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
@@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   int getNbOutputs() const override { return 1; }
   int initialize() override;
 
-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    int sum_num = 0;
+    sum_num += SerializedSize(emb_sizes_);
+
+    for (size_t i = 0; i < emb_sizes_.size(); i++) {
+      sum_num += emb_sizes_[i] * sizeof(float);
+    }
+
+    sum_num += SerializedSize(bias_size_);
+    sum_num += SerializedSize(scale_size_);
+
+    sum_num += (bias_size_ + scale_size_) * sizeof(float);
+    sum_num += SerializedSize(hidden_size_);
+    sum_num += SerializedSize(eps_);
+    // sum_num += SerializedSize(with_fp16_);
+
+    return sum_num;
+  }
+
+  void serialize(void* buffer) const override {
+    // SerializeValue(&buffer, with_fp16_);
+    SerializeValue(&buffer, emb_sizes_);
+    for (size_t i = 0; i < emb_sizes_.size(); i++) {
+      SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
+    }
+    SerializeValue(&buffer, bias_size_);
+    SerializeValue(&buffer, scale_size_);
+    SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
+    SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
+    SerializeValue(&buffer, hidden_size_);
+    SerializeValue(&buffer, eps_);
+  }
 
   nvinfer1::DimsExprs getOutputDimensions(
       int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
       nvinfer1::IExprBuilder& expr_builder) override;
 
   bool supportsFormatCombination(int pos,
-                                 const nvinfer1::PluginTensorDesc* inOut,
-                                 int nbInputs, int nbOutputs) override;
+                                 const nvinfer1::PluginTensorDesc* in_out,
+                                 int nb_inputs, int nb_outputs) override;
 
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
-                       int nbInputs,
+                       int nb_inputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) override {}
+                       int nb_outputs) override {}
 
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
+                          int nb_inputs,
                           const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const override {
+                          int nb_outputs) const override {
     return 0;
   }
 
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc,
+  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
+              const nvinfer1::PluginTensorDesc* output_desc,
               const void* const* inputs, void* const* outputs, void* workspace,
               cudaStream_t stream) override;
   nvinfer1::DataType getOutputDataType(int index,
-                                       const nvinfer1::DataType* inputTypes,
-                                       int nbInputs) const override;
+                                       const nvinfer1::DataType* input_types,
+                                       int nb_inputs) const override;
 
   void destroy() override { delete this; }
 
@@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   // data on devices
   float* bias_gpu_;
   float* scale_gpu_;
-  std::vector<T*> embs_gpu_;
+  std::vector<float*> embs_gpu_;
 
   std::vector<int> emb_sizes_;
   int bias_size_;
@@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   int hidden_size_;
   float eps_;
 };
+
+class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  EmbEltwiseLayernormPluginV2Creator() {}
+  const char* getPluginName() const override {
+    return "fused_embedding_eltwise_layernorm_plugin";
+  }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
+                                                       serial_length);
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_;
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator);
+
 #endif
 }  // namespace plugin
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
index f94da83b908..03edb54ca3d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
@@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
 
 // Dynamic Plugin below.
 #if IS_TRT_VERSION_GE(6000)
-size_t GeluPluginDynamic::getSerializationSize() const { return 0; }
-
-void GeluPluginDynamic::serialize(void* buffer) const {}
 
 nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
index e4dcce3a856..02219bc27a7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
@@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT {
 
   // It was used for tensorrt deserialization.
   // It should not be called by users.
-  GeluPlugin(void const* serialData, size_t serialLength) {
-    deserializeBase(serialData, serialLength);
+  GeluPlugin(void const* serial_data, size_t serial_length) {
+    deserializeBase(serial_data, serial_length);
   }
 
   ~GeluPlugin() {}
@@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT {
   bool supportsFormat(nvinfer1::DataType type,
                       nvinfer1::PluginFormat format) const override;
   nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
-                                     int nbInputDims) override;
-  int enqueue(int batchSize, const void* const* inputs, void** outputs,
+                                     int nb_input_dims) override;
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
               void* workspace, cudaStream_t stream) override;
 
  protected:
@@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT {
 class GeluPluginDynamic : public DynamicPluginTensorRT {
  public:
   GeluPluginDynamic() {}
-  GeluPluginDynamic(void const* serialData, size_t serialLength) {}
+  GeluPluginDynamic(void const* serial_data, size_t serial_length) {}
 
   ~GeluPluginDynamic() {}
   nvinfer1::IPluginV2DynamicExt* clone() const override {
@@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT {
   int getNbOutputs() const override { return 1; }
   int initialize() override { return 0; }
 
-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override { return 0; }
+  void serialize(void* buffer) const override {}
 
   nvinfer1::DimsExprs getOutputDimensions(
-      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
-      nvinfer1::IExprBuilder& exprBuilder) override;
+      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+      nvinfer1::IExprBuilder& expr_builder) override;
 
   bool supportsFormatCombination(int pos,
-                                 const nvinfer1::PluginTensorDesc* inOut,
-                                 int nbInputs, int nbOutputs) override;
+                                 const nvinfer1::PluginTensorDesc* in_out,
+                                 int nb_inputs, int nb_outputs) override;
 
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
-                       int nbInputs,
+                       int nb_inputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) override {}
+                       int nb_outputs) override {}
 
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
+                          int nb_inputs,
                           const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const override {
+                          int nb_outputs) const override {
     return 0;
   }
 
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc,
+  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
+              const nvinfer1::PluginTensorDesc* output_desc,
               const void* const* inputs, void* const* outputs, void* workspace,
               cudaStream_t stream) override;
   nvinfer1::DataType getOutputDataType(int index,
-                                       const nvinfer1::DataType* inputTypes,
-                                       int nbInputs) const override;
+                                       const nvinfer1::DataType* input_types,
+                                       int nb_inputs) const override;
 
   void destroy() override { delete this; }
 };
+
+class GeluPluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  GeluPluginV2Creator() {}
+  const char* getPluginName() const override { return "gelu_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new GeluPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator);
 #endif
 
 }  // namespace plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 240ecaa2589..447769db132 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -152,10 +152,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
 
 int QkvToContextPluginDynamic::initialize() { return 0; }
 
-size_t QkvToContextPluginDynamic::getSerializationSize() const { return 0; }
-
-void QkvToContextPluginDynamic::serialize(void *buffer) const {}
-
 nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
index 18c5d27fc45..72a2732ae20 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
@@ -1,3 +1,17 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
         scale_(scale),
         ban_fp16_(ban_fp16) {}
 
-  QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {}
+  QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &hidden_);
+    DeserializeValue(&serial_data, &serial_length, &head_number_);
+    DeserializeValue(&serial_data, &serial_length, &head_size_);
+    DeserializeValue(&serial_data, &serial_length, &scale_);
+    DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
+  }
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
                                          scale_, ban_fp16_);
@@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
   int getNbOutputs() const override { return 1; }
   int initialize() override;
 
-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    return SerializedSize(hidden_) + SerializedSize(head_number_) +
+           SerializedSize(head_size_) + SerializedSize(scale_) +
+           SerializedSize(ban_fp16_);
+  }
+  void serialize(void* buffer) const override {
+    SerializeValue(&buffer, hidden_);
+    SerializeValue(&buffer, head_number_);
+    SerializeValue(&buffer, head_size_);
+    SerializeValue(&buffer, scale_);
+    SerializeValue(&buffer, ban_fp16_);
+  }
 
   nvinfer1::DimsExprs getOutputDimensions(
       int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
       nvinfer1::IExprBuilder& expr_builder) override;
 
   bool supportsFormatCombination(int pos,
-                                 const nvinfer1::PluginTensorDesc* inOut,
-                                 int nbInputs, int nbOutputs) override;
+                                 const nvinfer1::PluginTensorDesc* in_out,
+                                 int nb_inputs, int nb_outputs) override;
 
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
-                       int nbInputs,
+                       int nb_inputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) override {}
+                       int nb_outputs) override {}
 
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
+                          int nb_inputs,
                           const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const override {
+                          int nb_outputs) const override {
     return 0;
   }
 
@@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
               const void* const* inputs, void* const* outputs, void* workspace,
               cudaStream_t stream) override;
   nvinfer1::DataType getOutputDataType(int index,
-                                       const nvinfer1::DataType* inputTypes,
-                                       int nbInputs) const override;
+                                       const nvinfer1::DataType* input_types,
+                                       int nb_inputs) const override;
 
   void destroy() override { delete this; }
 
@@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
   float scale_;
   bool ban_fp16_;
 };
+
+class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  QkvToContextPluginV2Creator() {}
+  const char* getPluginName() const override { return "qkv_to_context_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_;
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator);
 #endif
 
 }  // namespace plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index 4fb9bbc1ff8..c51dae5e00c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -32,18 +32,14 @@ namespace plugin {
 
 int SkipLayerNormPluginDynamic::initialize() {
   cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
+  cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float),
              cudaMemcpyHostToDevice);
   cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
+  cudaMemcpy(scale_gpu_, scale_.data(), scale_size_ * sizeof(float),
              cudaMemcpyHostToDevice);
   return 0;
 }
 
-size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; }
-
-void SkipLayerNormPluginDynamic::serialize(void *buffer) const {}
-
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index ede90d7db16..8fe1edc4bf0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -29,61 +29,84 @@ namespace plugin {
 #if IS_TRT_VERSION_GE(6000)
 class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
  public:
-  explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size,
-                                      int scale_size, const float eps,
-                                      bool ban_fp16)
-      : bias_(bias),
-        scale_(scale),
-        bias_size_(bias_size),
+  explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale,
+                                      int bias_size, int scale_size,
+                                      const float eps, bool ban_fp16)
+      : bias_size_(bias_size),
         scale_size_(scale_size),
         eps_(eps),
-        ban_fp16_(ban_fp16) {}
-  SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {}
+        ban_fp16_(ban_fp16) {
+    bias_.resize(bias_size);
+    scale_.resize(scale_size);
+    std::copy(bias, bias + bias_size, bias_.data());
+    std::copy(scale, scale + scale_size, scale_.data());
+  }
+  SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &bias_);
+    DeserializeValue(&serial_data, &serial_length, &scale_);
+    DeserializeValue(&serial_data, &serial_length, &bias_size_);
+    DeserializeValue(&serial_data, &serial_length, &scale_size_);
+    DeserializeValue(&serial_data, &serial_length, &eps_);
+    DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
+  }
+
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_,
-                                          scale_size_, eps_, ban_fp16_);
+    return new SkipLayerNormPluginDynamic(
+        bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
   }
 
   const char* getPluginType() const override { return "skip_layernorm_plugin"; }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
 
-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
+                      SerializedSize(bias_size_) + SerializedSize(scale_size_) +
+                      SerializedSize(eps_) + SerializedSize(eps_);
+    return ser_size;
+  }
+  void serialize(void* buffer) const override {
+    SerializeValue(&buffer, bias_);
+    SerializeValue(&buffer, scale_);
+    SerializeValue(&buffer, bias_size_);
+    SerializeValue(&buffer, scale_size_);
+    SerializeValue(&buffer, eps_);
+    SerializeValue(&buffer, ban_fp16_);
+  }
 
   nvinfer1::DimsExprs getOutputDimensions(
       int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
       nvinfer1::IExprBuilder& expr_builder) override;
 
   bool supportsFormatCombination(int pos,
-                                 const nvinfer1::PluginTensorDesc* inOut,
-                                 int nbInputs, int nbOutputs) override;
+                                 const nvinfer1::PluginTensorDesc* in_out,
+                                 int nb_inputs, int nb_outputs) override;
 
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
-                       int nbInputs,
+                       int nb_inputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) override {}
+                       int nb_outputs) override {}
 
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
+                          int nb_inputs,
                           const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const override {
+                          int nb_outputs) const override {
     return 0;
   }
 
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc,
+  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
+              const nvinfer1::PluginTensorDesc* output_desc,
               const void* const* inputs, void* const* outputs, void* workspace,
               cudaStream_t stream) override;
   nvinfer1::DataType getOutputDataType(int index,
-                                       const nvinfer1::DataType* inputTypes,
-                                       int nbInputs) const override;
+                                       const nvinfer1::DataType* input_types,
+                                       int nb_inputs) const override;
 
   void destroy() override { delete this; }
 
  private:
-  float* bias_;
-  float* scale_;
+  std::vector<float> bias_;
+  std::vector<float> scale_;
 
   float* bias_gpu_;
   float* scale_gpu_;
@@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
   float eps_;
   bool ban_fp16_;
 };
+
+class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  SkipLayerNormPluginV2Creator() {}
+  const char* getPluginName() const override { return "skip_layernorm_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_;
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator);
 #endif
 
 }  // namespace plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index 0fc0f7bf35d..f4424b8b785 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -175,11 +175,24 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
   void serializeBase(void*& buffer) const;  // NOLINT
 
  private:
-  std::string name_space_{"paddle_trt"};
-  std::string plugin_base_{"plugin_dynamic"};
+  std::string name_space_;
+  std::string plugin_base_;
 };
 #endif
 
+template <typename T>
+class TrtPluginRegistrarV2 {
+ public:
+  TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
+
+ private:
+  T creator;
+};
+
+#define REGISTER_TRT_PLUGIN_V2(name)                                     \
+  static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
+      plugin_registrar_##name {}
+
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index 1cae4ccae4c..18037179c7b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -128,6 +128,12 @@ inline void DeserializeValue(void const** buffer, size_t* buffer_size,
   return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
 }
 
+template <typename T>
+inline void SerializeCudaPointer(void** buffer, T* value, int size) {
+  cudaMemcpy((*buffer), value, size * sizeof(T), cudaMemcpyDeviceToHost);
+  reinterpret_cast<char*&>(*buffer) += size * sizeof(T);
+}
+
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 62c9dfa0d9d..6bc7728487c 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -433,6 +433,25 @@ if(WITH_GPU AND TENSORRT_FOUND)
     inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
             EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
             ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
+
+    set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
+    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL})
+        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
+    endif()
+
+    inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
+            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
+
+    set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
+    if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
+        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
+    endif()
+
+    inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
+            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
+
 endif()
 
 set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
new file mode 100644
index 00000000000..6526b874365
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -0,0 +1,146 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+void run(const AnalysisConfig& config, std::vector<float>* out_data) {
+  auto predictor = CreatePaddlePredictor(config);
+  auto input_names = predictor->GetInputNames();
+
+  int run_batch = 1;
+  const int run_seq_len = 128;
+
+  std::vector<int64_t> tmp_input;
+  std::vector<float> tmp_four_input;
+  tmp_input.reserve(run_batch * run_seq_len);
+  tmp_four_input.reserve(run_batch * run_seq_len);
+
+  int64_t i0[run_seq_len] = {
+      1,    3558, 4,   75,  491, 89, 340, 313, 93,   4,   255,   10, 75,    321,
+      4095, 1902, 4,   134, 49,  75, 311, 14,  44,   178, 543,   15, 12043, 2,
+      75,   201,  340, 9,   14,  44, 486, 218, 1140, 279, 12043, 2};
+  int64_t i1[run_seq_len] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  int64_t i2[run_seq_len] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+                             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                             20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                             30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
+  float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+  // first input
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({run_batch, run_seq_len, 1});
+  input_t->copy_from_cpu(i0);
+
+  // second input
+  auto input_t2 = predictor->GetInputTensor(input_names[1]);
+  input_t2->Reshape({run_batch, run_seq_len, 1});
+  input_t2->copy_from_cpu(i1);
+
+  // third input.
+  auto input_t3 = predictor->GetInputTensor(input_names[2]);
+  input_t3->Reshape({run_batch, run_seq_len, 1});
+  input_t3->copy_from_cpu(i2);
+
+  auto input_t4 = predictor->GetInputTensor(input_names[3]);
+  input_t4->Reshape({run_batch, run_seq_len, 1});
+  input_t4->copy_from_cpu(i3);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data->resize(out_num);
+  output_t->copy_to_cpu(out_data->data());
+}
+
+void trt_ernie(bool with_fp16, std::vector<float> result) {
+  AnalysisConfig config;
+  std::string model_dir = FLAGS_infer_model;
+  SetConfig(&config, model_dir, true /* use_gpu */);
+
+  config.SwitchUseFeedFetchOps(false);
+
+  int head_number = 12;
+  int batch = 1;
+  int min_seq_len = 1;
+  int max_seq_len = 128;
+  int opt_seq_len = 128;
+
+  std::vector<int> min_shape = {batch, min_seq_len, 1};
+  std::vector<int> max_shape = {batch, max_seq_len, 1};
+  std::vector<int> opt_shape = {batch, opt_seq_len, 1};
+  // Set the input's min, max, opt shape
+  std::map<std::string, std::vector<int>> min_input_shape = {
+      {"read_file_0.tmp_0", min_shape},
+      {"read_file_0.tmp_1", min_shape},
+      {"read_file_0.tmp_2", min_shape},
+      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
+  std::map<std::string, std::vector<int>> max_input_shape = {
+      {"read_file_0.tmp_0", max_shape},
+      {"read_file_0.tmp_1", max_shape},
+      {"read_file_0.tmp_2", max_shape},
+      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
+  std::map<std::string, std::vector<int>> opt_input_shape = {
+      {"read_file_0.tmp_0", opt_shape},
+      {"read_file_0.tmp_1", opt_shape},
+      {"read_file_0.tmp_2", opt_shape},
+      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
+
+  auto precision = AnalysisConfig::Precision::kFloat32;
+  if (with_fp16) {
+    precision = AnalysisConfig::Precision::kHalf;
+  }
+  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
+  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                opt_input_shape);
+  std::vector<float> out_data;
+  run(config, &out_data);
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+  }
+}
+
+TEST(AnalysisPredictor, no_fp16) {
+  std::vector<float> result = {0.597841, 0.219972, 0.182187};
+  trt_ernie(false, result);
+}
+
+TEST(AnalysisPredictor, fp16) {
+#ifdef SUPPORTS_CUDA_FP16
+  std::vector<float> result = {0.598336, 0.219558, 0.182106};
+  trt_ernie(true, result);
+#endif
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
index 5fbf0867ba2..babe9977cd5 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   if (with_fp16) {
     precision = AnalysisConfig::Precision::kHalf;
   }
-  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, true);
+  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false);
   config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 opt_input_shape);
   std::vector<float> out_data;
diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu
index 59a79bcb699..35b4c40d6d7 100644
--- a/paddle/fluid/operators/math/bert_encoder_functor.cu
+++ b/paddle/fluid/operators/math/bert_encoder_functor.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cuda_runtime.h>
+#include <algorithm>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
index f3c8e27944c..c9c3a9456b7 100644
--- a/paddle/fluid/platform/dynload/tensorrt.cc
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -13,18 +13,62 @@
    limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/tensorrt.h"
+#include <string>
 
 namespace paddle {
 namespace platform {
 namespace dynload {
 
 std::once_flag tensorrt_dso_flag;
-void *tensorrt_dso_handle;
+void* tensorrt_dso_handle;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
+void* GetTensorRtHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  std::string dso_name = "libnvinfer.dylib";
+#elif defined(_WIN32)
+  std::string dso_name = "nvinfer.dll";
+#else
+  std::string dso_name = "libnvinfer.so";
+#endif
+
+#if !defined(_WIN32)
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+#else
+  int dynload_flags = 0;
+#endif  // !_WIN32
+
+  void* dso_handle = dlopen(dso_name.c_str(), dynload_flags);
+
+  if (nullptr == dso_handle) {
+    auto error_msg =
+        "TensorRT dynamic library (%s) that Paddle depends on is not "
+        "configured correctly. (error code is %s)\n"
+        "  Suggestions:\n"
+        "  1. Check if TensorRT "
+        "is installed correctly and its version is matched with paddlepaddle "
+        "you installed.\n"
+        "  2. Configure TensorRT dynamic library environment variables as "
+        "follows:\n"
+        "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
+        "  - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
+        "  - Mac: set  DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
+        "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
+        "impossible unless System Integrity Protection (SIP) is disabled.]";
+#if !defined(_WIN32)
+    auto errorno = dlerror();
+#else
+    auto errorno = GetLastError();
+#endif  // !_WIN32
+    std::cerr << string::Sprintf(error_msg, dso_name, errorno);
+  }
+
+  return dso_handle;
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 566f887014b..60e299385d6 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -27,6 +27,8 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
+void* GetTensorRtHandle();
+
 extern std::once_flag tensorrt_dso_flag;
 extern void* tensorrt_dso_handle;
 
@@ -36,8 +38,7 @@ extern void* tensorrt_dso_handle;
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
       using tensorrt_func = decltype(&::__name);                              \
       std::call_once(tensorrt_dso_flag, []() {                                \
-        tensorrt_dso_handle =                                                 \
-            paddle::platform::dynload::GetTensorRtDsoHandle();                \
+        tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
         PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle,                          \
                                 platform::errors::Unavailable(                \
                                     "Load tensorrt %s failed", #__name));     \
@@ -53,7 +54,8 @@ extern void* tensorrt_dso_handle;
 
 #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
   __macro(createInferBuilder_INTERNAL);     \
-  __macro(createInferRuntime_INTERNAL);
+  __macro(createInferRuntime_INTERNAL);     \
+  __macro(getPluginRegistry);
 
 TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
 
-- 
GitLab