Unverified commit 3441e5e8, authored by Wangzheee, committed by GitHub

[Paddle Inference] Support fusing n lookup_tables into embedding layernorm (1) (#46230)

* [Paddle Inference] Support fusing n lookup_tables into embedding layernorm (1): add some functions for embedding
Parent 264ad205
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <vector>
#include "NvInfer.h"
#include "NvInferRuntimeCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/plugin.h"
#define TRT_UNUSED (void)
#define BERT_PRINT_DEBUG_MSG 0
#if BERT_PRINT_DEBUG_MSG
#define TRANSFORMER_DEBUG_MSG(msg) (gLogVerbose << (msg) << std::endl)
#define BERT_DEBUG_VALUE(key, value) (gLogVerbose << key << value << std::endl)
#else
#define TRANSFORMER_DEBUG_MSG(msg) TRT_UNUSED(msg)
#define BERT_DEBUG_VALUE(key, value) \
TRT_UNUSED(key); \
TRT_UNUSED(value)
#endif
using half = __half;
constexpr uint32_t BDIM = 1; // batch dimension
constexpr uint32_t SDIM = 0; // seq len dimension
constexpr uint32_t HDIM = 2; // hidden dimension
constexpr int32_t kSM_53 = 53;
constexpr int32_t kSM_70 = 70;
constexpr int32_t kSM_72 = 72;
constexpr int32_t kSM_75 = 75;
constexpr int32_t kSM_80 = 80;
constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_87 = 87;
constexpr size_t threadsPerCta128 = 2 * 2 * 32;
constexpr size_t threadsPerCta384 = 1 * 8 * 32;
// The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M
// dimension: (s + 16*warps_m - 1) / (16*warps_m);
constexpr size_t xmmasM128 = 4;
constexpr size_t xmmasM384 = 24;
// Packed mask size per batch. Layout is XMMAS_M * THREADS_PER_CTA.
constexpr size_t unfusedMaskSize = 1;
constexpr size_t packedMaskSize64 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize96 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize128 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize384 = xmmasM384 * threadsPerCta384;
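// Illustrative sanity check (assumes warps_m = 2 for the 128-token kernels and
// warps_m = 1 for the 384-token kernels, matching threadsPerCta128/384 above):
//   xmmasM128 = (128 + 16*2 - 1) / (16*2) = 4
//   xmmasM384 = (384 + 16*1 - 1) / (16*1) = 24
static_assert(xmmasM128 == (128 + 16 * 2 - 1) / (16 * 2),
              "xmmasM128 does not match the XMMA formula");
static_assert(xmmasM384 == (384 + 16 * 1 - 1) / (16 * 1),
              "xmmasM384 does not match the XMMA formula");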
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
switch (t) {
case nvinfer1::DataType::kINT32:
return 4;
case nvinfer1::DataType::kFLOAT:
return 4;
case nvinfer1::DataType::kHALF:
return 2;
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kINT8:
return 1;
}
return 0;
}
inline int64_t getWeightsSize(const nvinfer1::Weights& w,
nvinfer1::DataType type) {
return w.count * getElementSize(type);
}
template <typename T>
inline void serFromDev(char** buffer, const T* data, size_t nbElem) {
const size_t len = sizeof(T) * nbElem;
  cudaMemcpy(
      *buffer, static_cast<const void*>(data), len, cudaMemcpyDeviceToHost);
  *buffer += len;
}
template <typename T>
struct CudaDeleter {
void operator()(T* buf) { cudaFree(buf); }
};
template <typename T>
using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
template <typename T>
using cuda_shared_ptr = std::shared_ptr<T>;
template <typename T>
void make_cuda_shared(cuda_shared_ptr<T>* ptr, void* cudaMem) {
ptr->reset(static_cast<T*>(cudaMem), CudaDeleter<T>());
}
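// Usage sketch (illustrative): wrap a raw cudaMalloc'ed buffer so it is
// released with cudaFree when the last reference goes away.
//   void* dev = nullptr;
//   cudaMalloc(&dev, 256 * sizeof(float));
//   cuda_shared_ptr<float> buf;
//   make_cuda_shared(&buf, dev);  // buf now owns dev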
struct WeightsWithOwnership : public nvinfer1::Weights {
WeightsWithOwnership() {
values = nullptr;
count = 0;
}
~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
WeightsWithOwnership(const WeightsWithOwnership&) = delete;
WeightsWithOwnership operator=(const WeightsWithOwnership&) = delete;
WeightsWithOwnership(const WeightsWithOwnership&&) = delete;
WeightsWithOwnership operator=(const WeightsWithOwnership&&) = delete;
void convertAndCopy(const nvinfer1::Weights& src, nvinfer1::DataType type) {
this->type = type;
this->count = src.count;
if (type == nvinfer1::DataType::kFLOAT) {
auto destBuf = new float[src.count];
this->values = destBuf;
if (src.type == nvinfer1::DataType::kFLOAT) {
TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Float Array(Host)");
std::copy_n(static_cast<const float*>(src.values), src.count, destBuf);
} else {
assert(src.type == nvinfer1::DataType::kHALF);
TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Float Array(Host)");
const auto s = static_cast<const half*>(src.values);
auto d = static_cast<float*>(const_cast<void*>(this->values));
for (auto it = 0; it < src.count; it++) {
d[it] = __half2float(s[it]);
}
}
} else if (type == nvinfer1::DataType::kHALF) {
auto destBuf = new half[src.count];
this->values = destBuf;
if (src.type == nvinfer1::DataType::kHALF) {
TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Half Array(Host)");
std::copy_n(static_cast<const half*>(src.values), src.count, destBuf);
} else {
assert(src.type == nvinfer1::DataType::kFLOAT);
TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Half Array(Host)");
const auto s = static_cast<const float*>(src.values);
auto d = static_cast<half*>(const_cast<void*>(this->values));
for (auto it = 0; it < src.count; it++) {
d[it] = __float2half(s[it]);
}
}
} else {
throw std::runtime_error("Unsupported DataType specified for plugin.");
}
}
void convertAndCopy(const char** srcBuf,
size_t count,
nvinfer1::DataType type) noexcept {
this->type = type;
this->count = count;
const auto nbBytes = getWeightsSize(*this, type);
auto destBuf = new char[nbBytes];
this->values = destBuf;
    std::copy_n(*srcBuf, nbBytes, destBuf);
    *srcBuf += nbBytes;
}
};
template <typename T>
inline void copyToDevice(WeightsWithOwnership* hostWeights,
size_t nbBytes,
cuda_unique_ptr<T>* cudaWeights) {
if (hostWeights->values) {
void* cudaMem{nullptr};
cudaMalloc(&cudaMem, nbBytes);
cudaMemcpy(cudaMem, hostWeights->values, nbBytes, cudaMemcpyHostToDevice);
cudaWeights->reset(static_cast<T*>(cudaMem));
}
}
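// Deserialization sketch (illustrative; serialData and count are hypothetical
// names): rebuild weights from a serialized plugin buffer on the host, then
// upload them to the device.
//   const char* ptr = static_cast<const char*>(serialData);
//   WeightsWithOwnership w;
//   w.convertAndCopy(&ptr, count, nvinfer1::DataType::kHALF);
//   cuda_unique_ptr<void> devW;
//   copyToDevice(&w, getWeightsSize(w, w.type), &devW);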
inline nvinfer1::DataType fieldTypeToDataType(
const nvinfer1::PluginFieldType ftype) {
switch (ftype) {
case nvinfer1::PluginFieldType::kFLOAT32: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
return nvinfer1::DataType::kFLOAT;
}
case nvinfer1::PluginFieldType::kFLOAT16: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float16");
return nvinfer1::DataType::kHALF;
}
case nvinfer1::PluginFieldType::kINT32: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Int32");
return nvinfer1::DataType::kINT32;
}
case nvinfer1::PluginFieldType::kINT8: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Int8");
return nvinfer1::DataType::kINT8;
}
default:
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
return nvinfer1::DataType::kFLOAT;
}
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
#endif // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef COMMON_CUH
#define COMMON_CUH
#include "cublas_v2.h"
#include <cub/cub.cuh>
using kv_float = cub::KeyValuePair<float, float>;
using kv_half = cub::KeyValuePair<half, half>;
using kv_half2 = cub::KeyValuePair<half2, half2>;
template <typename T>
__device__ inline T rsqrt(const T& x);
template <>
__device__ inline float rsqrt(const float& x) {
return rsqrtf(x);
}
__device__ inline kv_float operator+(const kv_float& a, const kv_float& b) {
return kv_float(a.key + b.key, a.value + b.value);
}
// Half Operations
__device__ inline half2 __hadd2_with_fallback(const half2 a, const half2 b) {
#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
#else
float2 out {};
out.x = __half2float(a.x) + __half2float(b.x);
out.y = __half2float(a.y) + __half2float(b.y);
return __float22half2_rn(out);
#endif
}
#if __CUDA_ARCH__ < 530
template <typename T>
__device__ inline T operator+(const T& a, const T& b);
template <typename T>
__device__ inline T operator*(const T& a, const T& b);
template <>
__device__ inline half2 operator+(const half2& a, const half2& b) {
return __hadd2_with_fallback(a, b);
}
template <>
__device__ inline half2 operator*(const half2& a, const half2& b) {
float2 out {};
out.x = __half2float(a.x) * __half2float(b.x);
out.y = __half2float(a.y) * __half2float(b.y);
return __float22half2_rn(out);
}
template <typename T>
__device__ inline T operator+(const T& a, const T& b);
template <typename T>
__device__ inline T operator/(const T& a, const T& b);
template <typename T>
__device__ inline T& operator+=(T& a, const T& b);
template <typename T>
__device__ inline T operator-(const T& a, const T& b);
template <typename T>
__device__ inline T operator*(const T& a, const T& b);
template <>
__device__ inline half operator+(const half& a, const half& b) {
return __float2half(__half2float(a) + __half2float(b));
}
template <>
__device__ inline half& operator+=(half& a, const half& b) {
a = __float2half(__half2float(a) + __half2float(b));
return a;
}
template <>
__device__ inline half operator-(const half& a, const half& b) {
return __float2half(__half2float(a) - __half2float(b));
}
template <>
__device__ inline half operator*(const half& a, const half& b) {
return __float2half(__half2float(a) * __half2float(b));
}
template <>
__device__ inline half operator/(const half& a, const half& b) {
return __float2half(__half2float(a) / __half2float(b));
}
#endif
template <>
__device__ inline half rsqrt(const half& x) {
#if __CUDA_ARCH__ >= 530
return hrsqrt(x);
#else
return __float2half(rsqrt(__half2float(x)));
#endif
}
__device__ inline kv_half operator+(const kv_half& a, const kv_half& b) {
const half2 a2 = __halves2half2(a.key, a.value);
const half2 b2 = __halves2half2(b.key, b.value);
const half2 res = __hadd2_with_fallback(a2, b2);
return kv_half(res.x, res.y);
}
__device__ inline kv_half2 operator+(const kv_half2& a, const kv_half2& b) {
return kv_half2(__hadd2_with_fallback(a.key, b.key), __hadd2_with_fallback(a.value, b.value));
}
// Helper Functions
template <typename T>
using kvp = cub::KeyValuePair<T, T>;
template <typename T, typename R, typename P, int TPB>
__device__ inline void layerNorm(
const kvp<R>& threadData, const int ld, const int offset, const P* beta, const P* gamma, T* output) {
// Assuming threadData is already divided by ld
using BlockReduce = cub::BlockReduce<kvp<R>, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ R mu; // mean
__shared__ R rsigma; // 1 / std.dev.
const auto sumKV = BlockReduce(temp_storage).Reduce(threadData, cub::Sum());
if (threadIdx.x == 0) {
mu = sumKV.key;
rsigma = rsqrt(sumKV.value - mu * mu);
}
__syncthreads();
for (int i = threadIdx.x; i < ld; i += TPB) {
const int idx = offset + i;
const R val = output[idx];
const R g(gamma[i]);
const R b(beta[i]);
output[idx] = g * (val - mu) * rsigma + b;
}
}
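// Note (illustrative): callers accumulate threadData as the running pair
// (sum(x) / ld, sum(x * x) / ld), so after the block reduction mu = E[x] and
// rsigma = 1 / sqrt(E[x^2] - E[x]^2), i.e. the standard layer-norm statistics
// computed in a single pass over the hidden dimension.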
#endif // #ifndef COMMON_CUH
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
#include <cuda_runtime.h>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include "NvInfer.h"
#include "NvInferPlugin.h"
typedef enum {
STATUS_SUCCESS = 0,
STATUS_FAILURE = 1,
STATUS_BAD_PARAM = 2,
STATUS_NOT_SUPPORTED = 3,
STATUS_NOT_INITIALIZED = 4
} pluginStatus_t;
namespace nvinfer1 {
class BasePlugin : public IPluginV2 {
protected:
void setPluginNamespace(const char* libNamespace) noexcept override {
mNamespace = libNamespace;
}
const char* getPluginNamespace() const noexcept override {
return mNamespace.c_str();
}
std::string mNamespace;
};
class BaseCreator : public IPluginCreator {
public:
void setPluginNamespace(const char* libNamespace) noexcept override {
mNamespace = libNamespace;
}
const char* getPluginNamespace() const noexcept override {
return mNamespace.c_str();
}
protected:
std::string mNamespace;
};
} // namespace nvinfer1
#endif // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
#include <iostream>
using std::cerr;
using std::cout;
using std::endl;
template <typename T>
inline void serialize_value(void** buffer, T const& value);
template <typename T>
inline void deserialize_value(void const** buffer,
size_t* buffer_size,
T* value);
template <typename T, class Enable = void>
struct Serializer {};
template <typename T>
struct Serializer<T,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(T const&) { return sizeof(T); }
static void serialize(void** buffer, T const& value) {
::memcpy(*buffer, &value, sizeof(T));
reinterpret_cast<char*&>(*buffer) += sizeof(T);
}
static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
assert(*buffer_size >= sizeof(T));
::memcpy(value, *buffer, sizeof(T));
reinterpret_cast<char const*&>(*buffer) += sizeof(T);
*buffer_size -= sizeof(T);
}
};
template <>
struct Serializer<const char*> {
static size_t serialized_size(const char* value) { return strlen(value) + 1; }
static void serialize(void** buffer, const char* value) {
    ::snprintf(static_cast<char*>(*buffer), strlen(value) + 1, "%s", value);
reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
}
static void deserialize(void const** buffer,
size_t* buffer_size,
const char** value) {
*value = static_cast<char const*>(*buffer);
size_t data_size = strnlen(*value, *buffer_size) + 1;
assert(*buffer_size >= data_size);
reinterpret_cast<char const*&>(*buffer) += data_size;
*buffer_size -= data_size;
}
};
template <typename T>
struct Serializer<std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T);
}
static void serialize(void** buffer, std::vector<T> const& value) {
serialize_value(buffer, value.size());
size_t nbyte = value.size() * sizeof(T);
::memcpy(*buffer, value.data(), nbyte);
reinterpret_cast<char*&>(*buffer) += nbyte;
}
static void deserialize(void const** buffer,
size_t* buffer_size,
std::vector<T>* value) {
size_t size;
deserialize_value(buffer, buffer_size, &size);
value->resize(size);
size_t nbyte = value->size() * sizeof(T);
assert(*buffer_size >= nbyte);
::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte;
}
};
template <typename T>
inline size_t serialized_size(T const& value) {
return Serializer<T>::serialized_size(value);
}
template <typename T>
inline void serialize_value(void** buffer, T const& value) {
return Serializer<T>::serialize(buffer, value);
}
template <typename T>
inline void deserialize_value(void const** buffer,
size_t* buffer_size,
T* value) {
return Serializer<T>::deserialize(buffer, buffer_size, value);
}
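// Round-trip sketch (illustrative): serialize a POD and a vector into a raw
// byte buffer and read them back, mirroring how TensorRT plugins implement
// getSerializationSize() / serialize() / the deserialization constructor.
//   float eps = 1e-5f;
//   std::vector<int32_t> table_sizes = {30522, 2, 512};
//   size_t total = serialized_size(eps) + serialized_size(table_sizes);
//   std::vector<char> storage(total);
//   void* wptr = storage.data();
//   serialize_value(&wptr, eps);
//   serialize_value(&wptr, table_sizes);
//   const void* rptr = storage.data();
//   size_t remaining = total;
//   float eps2;
//   std::vector<int32_t> sizes2;
//   deserialize_value(&rptr, &remaining, &eps2);
//   deserialize_value(&rptr, &remaining, &sizes2);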
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>
#include "NvInfer.h"
#include "common/bertCommon.h"
#include "common/common.cuh"
#include "common/plugin.h"
#include "common/serialize.h"
// #include
// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T, unsigned TPB>
__global__ void embLayerNormKernelHFace(int32_t ld,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output) {
cub::Sum pairSum;
int32_t const s = blockIdx.x;
int32_t const b = blockIdx.y;
int32_t* cuSeqlens = inputIds[0];
int32_t const sumS = cuSeqlens[b];
int32_t const s_b = cuSeqlens[b + 1] - sumS;
if (s >= s_b) {
return; // This CTA has nothing to do
}
T const rld = T(1.f) / T(ld);
int32_t const seqPos = sumS + s;
extern __shared__ int32_t word_id[];
if (threadIdx.x == 0) {
for (int i = 1; i < nbLookupTables; ++i) {
if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
      printf(
          "Error (embLayerNormVarSeqlenPlugin): id is outside the lookup "
          "table range: id < 0 or id >= table size\n");
return;
} else {
word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
}
}
}
__syncthreads();
  // 2. load pos/tok/word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const poffset = blockIdx.x * ld;
int32_t const outOffset = seqPos * ld;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp<T> threadData(0, 0);
for (int32_t it = threadIdx.x; it < ld; it += TPB) {
T p(mIdsEmbDev[0][poffset + it]); // pos id
T val = p;
for (int i = 1; i < nbLookupTables; ++i) {
int32_t const offset = word_id[i - 1] * ld;
val += mIdsEmbDev[i][offset + it];
}
output[outOffset + it] = val;
T const rldval = rld * val;
threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
}
// 3. layer norm on the sum
layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
}
template <typename T>
int32_t embSkipLayerNormHFace(cudaStream_t stream,
int32_t ld,
int32_t B,
int32_t S,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output) {
constexpr int32_t tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
embLayerNormKernelHFace<T, tpb><<<grid, block, cache_size, stream>>>(
ld, inputIds, nbLookupTables, beta, gamma, mIdsEmbDev, IdsSize, output);
return cudaPeekAtLastError();
}
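// Launch sketch (illustrative; device pointer names are hypothetical):
// inputIds[0] points to the cumulative sequence lengths (B + 1 int32 values)
// and inputIds[1..nbLookupTables-1] to the id tensors; mIdsEmbDev[0] is the
// positional table and mIdsEmbDev[1..] the per-id embedding tables whose row
// counts are given in IdsSize.
//   embSkipLayerNormHFace<half>(stream, /*ld=*/768, B, S, dev_input_ids,
//                               /*nbLookupTables=*/3, dev_beta, dev_gamma,
//                               dev_emb_tables, dev_table_sizes, dev_output);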
template int32_t embSkipLayerNormHFace<float>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
float**,
int32_t*,
float*);
template int32_t embSkipLayerNormHFace<half>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
half**,
int32_t*,
half*);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>
#include "NvInfer.h"
#include "common/bertCommon.h"
#include "common/common.cuh"
#include "common/plugin.h"
#include "common/serialize.h"
// #include
// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T, unsigned TPB>
__global__ void embLayerNormKernelMTron(int32_t ld,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output,
T* skip) {
cub::Sum pairSum;
int32_t const s = blockIdx.x;
int32_t const b = blockIdx.y;
int32_t* cuSeqlens = inputIds[0];
int32_t const sumS = cuSeqlens[b];
int32_t const s_b = cuSeqlens[b + 1] - sumS;
if (s >= s_b) {
return; // This CTA has nothing to do
}
T const rld = T(1.f) / T(ld);
int32_t const seqPos = sumS + s;
extern __shared__ int32_t word_id[];
if (threadIdx.x == 0) {
for (int i = 1; i < nbLookupTables; ++i) {
if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
      printf(
          "Error (embLayerNormVarSeqlenPlugin): id is outside the lookup "
          "table range: id < 0 or id >= table size\n");
return;
} else {
word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
}
}
}
__syncthreads();
  // 2. load pos/tok/word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const poffset = blockIdx.x * ld;
int32_t const outOffset = seqPos * ld;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp<T> threadData(0, 0);
for (int32_t it = threadIdx.x; it < ld; it += TPB) {
T p(mIdsEmbDev[0][poffset + it]); // pos id
T val = p;
for (int i = 1; i < nbLookupTables; ++i) {
int32_t const offset = word_id[i - 1] * ld;
val += mIdsEmbDev[i][offset + it];
}
output[outOffset + it] = val;
skip[outOffset + it] = val;
T const rldval = rld * val;
threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
}
// 3. layer norm on the sum
layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
}
template <typename T>
int32_t embSkipLayerNormMTron(cudaStream_t stream,
int32_t ld,
int32_t B,
int32_t S,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output,
T* skip) {
constexpr int32_t tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
embLayerNormKernelMTron<T, tpb>
<<<grid, block, cache_size, stream>>>(ld,
inputIds,
nbLookupTables,
beta,
gamma,
mIdsEmbDev,
IdsSize,
output,
skip);
return cudaPeekAtLastError();
}
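// Note (illustrative): the MTron variant matches the HFace variant above
// except that it also writes the un-normalized embedding sum to `skip`, so a
// downstream layer can reuse the residual without repeating the lookups.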
template int32_t embSkipLayerNormMTron<float>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
float**,
int32_t*,
float*,
float*);
template int32_t embSkipLayerNormMTron<half>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
half**,
int32_t*,
half*,
half*);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
@@ -135,7 +135,12 @@ static void trt_ernie(bool with_fp16, std::vector<float> result) {
if (with_fp16) {
precision = AnalysisConfig::Precision::kHalf;
}
#if defined _WIN32
#else
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
#endif
config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
AnalysisConfig* config_deser = new AnalysisConfig(config);