diff --git a/paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h b/paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..3783b197ae03db7bd9b179cdefe0885623fddd6e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h
@@ -0,0 +1,224 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+// AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
+#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
+
+#include <cuda_fp16.h>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <numeric>
+#include <stdexcept>
+#include <vector>
+#include "NvInfer.h"
+#include "NvInferRuntimeCommon.h"
+#include "paddle/fluid/inference/tensorrt/plugin/common/plugin.h"
+
+#define TRT_UNUSED (void)
+#define BERT_PRINT_DEBUG_MSG 0
+#if BERT_PRINT_DEBUG_MSG
+#define TRANSFORMER_DEBUG_MSG(msg) (gLogVerbose << (msg) << std::endl)
+#define BERT_DEBUG_VALUE(key, value) (gLogVerbose << key << value << std::endl)
+#else
+#define TRANSFORMER_DEBUG_MSG(msg) TRT_UNUSED(msg)
+#define BERT_DEBUG_VALUE(key, value) \
+  TRT_UNUSED(key);                   \
+  TRT_UNUSED(value)
+#endif
+using half = __half;
+constexpr uint32_t BDIM = 1;  // batch dimension
+constexpr uint32_t SDIM = 0;  // seq len dimension
+constexpr uint32_t HDIM = 2;  // hidden dimension
+constexpr int32_t kSM_53 = 53;
+constexpr int32_t kSM_70 = 70;
+constexpr int32_t kSM_72 = 72;
+constexpr int32_t kSM_75 = 75;
+constexpr int32_t kSM_80 = 80;
+constexpr int32_t kSM_86 = 86;
+constexpr int32_t kSM_87 = 87;
+constexpr size_t threadsPerCta128 = 2 * 2 * 32;
+constexpr size_t threadsPerCta384 = 1 * 8 * 32;
+
+// The number of xmmas in the M dimension. We use one uint32_t per XMMA in the
+// M dimension: (s + 16*warps_m - 1) / (16*warps_m)
+constexpr size_t xmmasM128 = 4;
+constexpr size_t xmmasM384 = 24;
+
+// Packed mask size per batch. Layout is XMMAS_M * THREADS_PER_CTA.
+constexpr size_t unfusedMaskSize = 1;
+constexpr size_t packedMaskSize64 = xmmasM128 * threadsPerCta128;
+constexpr size_t packedMaskSize96 = xmmasM128 * threadsPerCta128;
+constexpr size_t packedMaskSize128 = xmmasM128 * threadsPerCta128;
+constexpr size_t packedMaskSize384 = xmmasM384 * threadsPerCta384;
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
+  switch (t) {
+    case nvinfer1::DataType::kINT32:
+      return 4;
+    case nvinfer1::DataType::kFLOAT:
+      return 4;
+    case nvinfer1::DataType::kHALF:
+      return 2;
+    case nvinfer1::DataType::kBOOL:
+    case nvinfer1::DataType::kINT8:
+      return 1;
+  }
+  return 0;
+}
+
+inline int64_t getWeightsSize(const nvinfer1::Weights& w,
+                              nvinfer1::DataType type) {
+  return w.count * getElementSize(type);
+}
+
+template <typename T>
+inline void serFromDev(char** buffer, const T* data, size_t nbElem) {
+  const size_t len = sizeof(T) * nbElem;
+  cudaMemcpy(
+      *buffer, static_cast<const void*>(data), len, cudaMemcpyDeviceToHost);
+  *buffer += len;
+}
+
+template <typename T>
+struct CudaDeleter {
+  void operator()(T* buf) { cudaFree(buf); }
+};
+
+template <typename T>
+using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
+
+template <typename T>
+using cuda_shared_ptr = std::shared_ptr<T>;
+
+template <typename T>
+void make_cuda_shared(cuda_shared_ptr<T>* ptr, void* cudaMem) {
+  ptr->reset(static_cast<T*>(cudaMem), CudaDeleter<T>());
+}
+
+struct WeightsWithOwnership : public nvinfer1::Weights {
+  WeightsWithOwnership() {
+    values = nullptr;
+    count = 0;
+  }
+  ~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
+
+  WeightsWithOwnership(const WeightsWithOwnership&) = delete;
+  WeightsWithOwnership operator=(const WeightsWithOwnership&) = delete;
+  WeightsWithOwnership(const WeightsWithOwnership&&) = delete;
+  WeightsWithOwnership operator=(const WeightsWithOwnership&&) = delete;
+
+  void convertAndCopy(const nvinfer1::Weights& src, nvinfer1::DataType type) {
+    this->type = type;
+    this->count = src.count;
+    if (type == nvinfer1::DataType::kFLOAT) {
+      auto destBuf = new float[src.count];
+      this->values = destBuf;
+      if (src.type == nvinfer1::DataType::kFLOAT) {
+        TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Float Array(Host)");
+        std::copy_n(static_cast<const float*>(src.values), src.count, destBuf);
+      } else {
+        assert(src.type == nvinfer1::DataType::kHALF);
+        TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Float Array(Host)");
+        const auto s = static_cast<const half*>(src.values);
+        auto d = static_cast<float*>(const_cast<void*>(this->values));
+        for (auto it = 0; it < src.count; it++) {
+          d[it] = __half2float(s[it]);
+        }
+      }
+    } else if (type == nvinfer1::DataType::kHALF) {
+      auto destBuf = new half[src.count];
+      this->values = destBuf;
+      if (src.type == nvinfer1::DataType::kHALF) {
+        TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Half Array(Host)");
+        std::copy_n(static_cast<const half*>(src.values), src.count, destBuf);
+      } else {
+        assert(src.type == nvinfer1::DataType::kFLOAT);
+        TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Half Array(Host)");
+        const auto s = static_cast<const float*>(src.values);
+        auto d = static_cast<half*>(const_cast<void*>(this->values));
+        for (auto it = 0; it < src.count; it++) {
+          d[it] = __float2half(s[it]);
+        }
+      }
+    } else {
+      throw std::runtime_error("Unsupported DataType specified for plugin.");
+    }
+  }
+
+  void convertAndCopy(const char** srcBuf,
+                      size_t count,
+                      nvinfer1::DataType type) noexcept {
+    this->type = type;
+    this->count = count;
+    const auto nbBytes = getWeightsSize(*this, type);
+    auto destBuf = new char[nbBytes];
+    this->values = destBuf;
+    std::copy_n(*srcBuf, nbBytes, destBuf);
+    *srcBuf += nbBytes;
+  }
+};
+
+template <typename T>
+inline void copyToDevice(WeightsWithOwnership* hostWeights,
+                         size_t nbBytes,
+                         cuda_unique_ptr<T>* cudaWeights) {
+  if (hostWeights->values) {
+    void* cudaMem{nullptr};
+    cudaMalloc(&cudaMem, nbBytes);
+    cudaMemcpy(cudaMem, hostWeights->values, nbBytes, cudaMemcpyHostToDevice);
+    cudaWeights->reset(static_cast<T*>(cudaMem));
+  }
+}
+
+inline nvinfer1::DataType fieldTypeToDataType(
+    const nvinfer1::PluginFieldType ftype) {
+  switch (ftype) {
+    case nvinfer1::PluginFieldType::kFLOAT32: {
+      TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
+      return nvinfer1::DataType::kFLOAT;
+    }
+    case nvinfer1::PluginFieldType::kFLOAT16: {
+      TRANSFORMER_DEBUG_MSG("PluginFieldType is Float16");
+      return nvinfer1::DataType::kHALF;
+    }
+    case nvinfer1::PluginFieldType::kINT32: {
+      TRANSFORMER_DEBUG_MSG("PluginFieldType is Int32");
+      return nvinfer1::DataType::kINT32;
+    }
+    case nvinfer1::PluginFieldType::kINT8: {
+      TRANSFORMER_DEBUG_MSG("PluginFieldType is Int8");
+      return nvinfer1::DataType::kINT8;
+    }
+    default:
+      TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
+      return nvinfer1::DataType::kFLOAT;
+  }
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+#endif  // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
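Note (illustrative, not part of the patch): the helpers above are typically used together when a plugin takes ownership of creator-supplied weights. A minimal sketch, assuming `srcWeights` holds FP32 or FP16 host data; `uploadWeightsExample` and its names are invented for the example:

```cpp
// Hypothetical usage sketch only. Converts host weights to FP16 and uploads
// them; the cuda_unique_ptr releases the device buffer with cudaFree.
#include "paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h"

void uploadWeightsExample(const nvinfer1::Weights& srcWeights) {
  using namespace paddle::inference::tensorrt::plugin;
  // Make an owned FP16 host copy (converts from FP32 if necessary).
  WeightsWithOwnership fp16Weights;
  fp16Weights.convertAndCopy(srcWeights, nvinfer1::DataType::kHALF);
  // Push the converted buffer to the device.
  cuda_unique_ptr<void> devWeights;
  const size_t nbBytes = getWeightsSize(fp16Weights, fp16Weights.type);
  copyToDevice(&fp16Weights, nbBytes, &devWeights);
}
```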
diff --git a/paddle/fluid/inference/tensorrt/plugin/common/common.cuh b/paddle/fluid/inference/tensorrt/plugin/common/common.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6e155de44d095fe7c55f86479b6b53f0a6263541
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/common/common.cuh
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef COMMON_CUH
+#define COMMON_CUH
+
+#include "cublas_v2.h"
+#include <cub/cub.cuh>
+
+using kv_float = cub::KeyValuePair<float, float>;
+using kv_half = cub::KeyValuePair<half, half>;
+using kv_half2 = cub::KeyValuePair<half2, half2>;
+
+template <typename T>
+__device__ inline T rsqrt(const T& x);
+
+template <>
+__device__ inline float rsqrt(const float& x) {
+  return rsqrtf(x);
+}
+
+__device__ inline kv_float operator+(const kv_float& a, const kv_float& b) {
+  return kv_float(a.key + b.key, a.value + b.value);
+}
+
+// Half Operations
+__device__ inline half2 __hadd2_with_fallback(const half2 a, const half2 b) {
+#if __CUDA_ARCH__ >= 530
+  return __hadd2(a, b);
+#else
+  float2 out{};
+  out.x = __half2float(a.x) + __half2float(b.x);
+  out.y = __half2float(a.y) + __half2float(b.y);
+  return __float22half2_rn(out);
+#endif
+}
+#if __CUDA_ARCH__ < 530
+template <typename T>
+__device__ inline T operator+(const T& a, const T& b);
+template <typename T>
+__device__ inline T operator*(const T& a, const T& b);
+template <>
+__device__ inline half2 operator+(const half2& a, const half2& b) {
+  return __hadd2_with_fallback(a, b);
+}
+template <>
+__device__ inline half2 operator*(const half2& a, const half2& b) {
+  float2 out{};
+  out.x = __half2float(a.x) * __half2float(b.x);
+  out.y = __half2float(a.y) * __half2float(b.y);
+  return __float22half2_rn(out);
+}
+template <typename T>
+__device__ inline T operator+(const T& a, const T& b);
+template <typename T>
+__device__ inline T operator/(const T& a, const T& b);
+template <typename T>
+__device__ inline T& operator+=(T& a, const T& b);
+template <typename T>
+__device__ inline T operator-(const T& a, const T& b);
+template <typename T>
+__device__ inline T operator*(const T& a, const T& b);
+template <>
+__device__ inline half operator+(const half& a, const half& b) {
+  return __float2half(__half2float(a) + __half2float(b));
+}
+template <>
+__device__ inline half& operator+=(half& a, const half& b) {
+  a = __float2half(__half2float(a) + __half2float(b));
+  return a;
+}
+template <>
+__device__ inline half operator-(const half& a, const half& b) {
+  return __float2half(__half2float(a) - __half2float(b));
+}
+template <>
+__device__ inline half operator*(const half& a, const half& b) {
+  return __float2half(__half2float(a) * __half2float(b));
+}
+template <>
+__device__ inline half operator/(const half& a, const half& b) {
+  return __float2half(__half2float(a) / __half2float(b));
+}
+#endif
+
+template <>
+__device__ inline half rsqrt(const half& x) {
+#if __CUDA_ARCH__ >= 530
+  return hrsqrt(x);
+#else
+  return __float2half(rsqrt(__half2float(x)));
+#endif
+}
+
+__device__ inline kv_half operator+(const kv_half& a, const kv_half& b) {
+  const half2 a2 = __halves2half2(a.key, a.value);
+  const half2 b2 = __halves2half2(b.key, b.value);
+  const half2 res = __hadd2_with_fallback(a2, b2);
+  return kv_half(res.x, res.y);
+}
+
+__device__ inline kv_half2 operator+(const kv_half2& a, const kv_half2& b) {
+  return kv_half2(__hadd2_with_fallback(a.key, b.key),
+                  __hadd2_with_fallback(a.value, b.value));
+}
+// Helper Functions
+template <typename T>
+using kvp = cub::KeyValuePair<T, T>;
+template <typename T, typename R, typename P, int TPB>
+__device__ inline void layerNorm(const kvp<R>& threadData,
+                                 const int ld,
+                                 const int offset,
+                                 const P* beta,
+                                 const P* gamma,
+                                 T* output) {
+  // Assuming threadData is already divided by ld
+  using BlockReduce = cub::BlockReduce<kvp<R>, TPB>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ R mu;      // mean
+  __shared__ R rsigma;  // 1 / std.dev.
+  const auto sumKV = BlockReduce(temp_storage).Reduce(threadData, cub::Sum());
+  if (threadIdx.x == 0) {
+    mu = sumKV.key;
+    rsigma = rsqrt(sumKV.value - mu * mu);
+  }
+  __syncthreads();
+  for (int i = threadIdx.x; i < ld; i += TPB) {
+    const int idx = offset + i;
+    const R val = output[idx];
+    const R g(gamma[i]);
+    const R b(beta[i]);
+    output[idx] = g * (val - mu) * rsigma + b;
+  }
+}
+
+#endif  // #ifndef COMMON_CUH
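Note (illustrative, not part of the patch): `layerNorm` expects each thread to have accumulated the pair (Σx/ld, Σx²/ld), so the block-wide reduction yields the mean μ and E[x²] directly, giving rsigma = 1/sqrt(E[x²] − μ²). A minimal caller sketch, one row per block; `rowLayerNormExample` is an invented name:

```cpp
// Hypothetical kernel: normalizes one ld-wide row of `data` per block using
// the layerNorm helper above. TPB must match the launch block size.
#include "common/common.cuh"

template <int TPB>
__global__ void rowLayerNormExample(int ld,
                                    float* data,
                                    const float* beta,
                                    const float* gamma) {
  const int offset = blockIdx.x * ld;  // first element of this block's row
  const float rld = 1.f / static_cast<float>(ld);
  kvp<float> threadData(0.f, 0.f);
  // Fold (x/ld, x*x/ld) into the running pair so the reduction inside
  // layerNorm produces the mean and the mean of squares.
  for (int i = threadIdx.x; i < ld; i += TPB) {
    const float val = data[offset + i];
    threadData = threadData + kvp<float>(rld * val, rld * val * val);
  }
  layerNorm<float, float, float, TPB>(threadData, ld, offset, beta, gamma, data);
}
// Possible launch: rowLayerNormExample<256><<<numRows, 256>>>(ld, data, beta, gamma);
```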
diff --git a/paddle/fluid/inference/tensorrt/plugin/common/plugin.h b/paddle/fluid/inference/tensorrt/plugin/common/plugin.h
new file mode 100644
index 0000000000000000000000000000000000000000..de8d7dc2deafb0821c822bd21659af63b581e53b
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/common/plugin.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+// AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
+#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
+#include <cuda_runtime.h>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include "NvInfer.h"
+#include "NvInferPlugin.h"
+
+typedef enum {
+  STATUS_SUCCESS = 0,
+  STATUS_FAILURE = 1,
+  STATUS_BAD_PARAM = 2,
+  STATUS_NOT_SUPPORTED = 3,
+  STATUS_NOT_INITIALIZED = 4
+} pluginStatus_t;
+
+namespace nvinfer1 {
+
+class BasePlugin : public IPluginV2 {
+ protected:
+  void setPluginNamespace(const char* libNamespace) noexcept override {
+    mNamespace = libNamespace;
+  }
+  const char* getPluginNamespace() const noexcept override {
+    return mNamespace.c_str();
+  }
+  std::string mNamespace;
+};
+
+class BaseCreator : public IPluginCreator {
+ public:
+  void setPluginNamespace(const char* libNamespace) noexcept override {
+    mNamespace = libNamespace;
+  }
+  const char* getPluginNamespace() const noexcept override {
+    return mNamespace.c_str();
+  }
+
+ protected:
+  std::string mNamespace;
+};
+
+}  // namespace nvinfer1
+#endif  // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
diff --git a/paddle/fluid/inference/tensorrt/plugin/common/serialize.h b/paddle/fluid/inference/tensorrt/plugin/common/serialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..b51528cb5abf0a5d2dc0721f7c0899800f227278
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/common/serialize.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+// AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <vector>
+
+#include <iostream>
+using std::cerr;
+using std::cout;
+using std::endl;
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value);
+
+template <typename T>
+inline void deserialize_value(void const** buffer,
+                              size_t* buffer_size,
+                              T* value);
+
+template <typename T, class Enable = void>
+struct Serializer {};
+
+template <typename T>
+struct Serializer<T,
+                  typename std::enable_if<std::is_arithmetic<T>::value ||
+                                          std::is_enum<T>::value ||
+                                          std::is_pod<T>::value>::type> {
+  static size_t serialized_size(T const&) { return sizeof(T); }
+  static void serialize(void** buffer, T const& value) {
+    ::memcpy(*buffer, &value, sizeof(T));
+    reinterpret_cast<char*&>(*buffer) += sizeof(T);
+  }
+  static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
+    assert(*buffer_size >= sizeof(T));
+    ::memcpy(value, *buffer, sizeof(T));
+    reinterpret_cast<char const*&>(*buffer) += sizeof(T);
+    *buffer_size -= sizeof(T);
+  }
+};
+
+template <>
+struct Serializer<const char*> {
+  static size_t serialized_size(const char* value) { return strlen(value) + 1; }
+  static void serialize(void** buffer, const char* value) {
+    ::snprintf(static_cast<char*>(*buffer), strlen(value) + 1, "%s", value);
+    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
+  }
+  static void deserialize(void const** buffer,
+                          size_t* buffer_size,
+                          const char** value) {
+    *value = static_cast<char const*>(*buffer);
+    size_t data_size = strnlen(*value, *buffer_size) + 1;
+    assert(*buffer_size >= data_size);
+    reinterpret_cast<char const*&>(*buffer) += data_size;
+    *buffer_size -= data_size;
+  }
+};
+
+template <typename T>
+struct Serializer<std::vector<T>,
+                  typename std::enable_if<std::is_arithmetic<T>::value ||
+                                          std::is_enum<T>::value ||
+                                          std::is_pod<T>::value>::type> {
+  static size_t serialized_size(std::vector<T> const& value) {
+    return sizeof(value.size()) + value.size() * sizeof(T);
+  }
+  static void serialize(void** buffer, std::vector<T> const& value) {
+    serialize_value(buffer, value.size());
+    size_t nbyte = value.size() * sizeof(T);
+    ::memcpy(*buffer, value.data(), nbyte);
+    reinterpret_cast<char*&>(*buffer) += nbyte;
+  }
+  static void deserialize(void const** buffer,
+                          size_t* buffer_size,
+                          std::vector<T>* value) {
+    size_t size;
+    deserialize_value(buffer, buffer_size, &size);
+    value->resize(size);
+    size_t nbyte = value->size() * sizeof(T);
+    assert(*buffer_size >= nbyte);
+    ::memcpy(value->data(), *buffer, nbyte);
+    reinterpret_cast<char const*&>(*buffer) += nbyte;
+    *buffer_size -= nbyte;
+  }
+};
+
+template <typename T>
+inline size_t serialized_size(T const& value) {
+  return Serializer<T>::serialized_size(value);
+}
+
+template <typename T>
+inline void serialize_value(void** buffer, T const& value) {
+  return Serializer<T>::serialize(buffer, value);
+}
+
+template <typename T>
+inline void deserialize_value(void const** buffer,
+                              size_t* buffer_size,
+                              T* value) {
+  return Serializer<T>::deserialize(buffer, buffer_size, value);
+}
diff --git a/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu b/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu
new file mode 100644
index 0000000000000000000000000000000000000000..366acbc11e04633a81a12ebc15ea472eebff74b8
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+// AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+#include "NvInfer.h"
+#include "common/bertCommon.h"
+#include "common/common.cuh"
+#include "common/plugin.h"
+#include "common/serialize.h"
+// #include
+// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+template <typename T, unsigned TPB>
+__global__ void embLayerNormKernelHFace(int32_t ld,
+                                        int32_t** inputIds,
+                                        int32_t const nbLookupTables,
+                                        float const* beta,
+                                        float const* gamma,
+                                        T** mIdsEmbDev,
+                                        int32_t* IdsSize,
+                                        T* output) {
+  cub::Sum pairSum;
+  int32_t const s = blockIdx.x;
+  int32_t const b = blockIdx.y;
+  int32_t* cuSeqlens = inputIds[0];
+  int32_t const sumS = cuSeqlens[b];
+  int32_t const s_b = cuSeqlens[b + 1] - sumS;
+  if (s >= s_b) {
+    return;  // This CTA has nothing to do
+  }
+  T const rld = T(1.f) / T(ld);
+  int32_t const seqPos = sumS + s;
+  extern __shared__ int32_t word_id[];
+
+  if (threadIdx.x == 0) {
+    for (int i = 1; i < nbLookupTables; ++i) {
+      if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
+          static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
+        printf(
+            "Error (embLayerNormVarSeqlenPlugin): ID out of range of the "
+            "lookup table: ID < 0 or ID >= max ");
+        return;
+      } else {
+        word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
+      }
+    }
+  }
+  __syncthreads();
+
+  // 2. load pos/tok/word embeddings and add them together
+  // offset into embeddings is given by wordId * hidden_size
+  int32_t const poffset = blockIdx.x * ld;
+  int32_t const outOffset = seqPos * ld;
+  // the output offset is given by b * (S*hidden_size) + s * hidden_size
+  kvp<T> threadData(0, 0);
+
+  for (int32_t it = threadIdx.x; it < ld; it += TPB) {
+    T p(mIdsEmbDev[0][poffset + it]);  // pos id
+    T val = p;
+    for (int i = 1; i < nbLookupTables; ++i) {
+      int32_t const offset = word_id[i - 1] * ld;
+      val += mIdsEmbDev[i][offset + it];
+    }
+    output[outOffset + it] = val;
+
+    T const rldval = rld * val;
+    threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
+  }
+
+  // 3. layer norm on the sum
+  layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
+}
+
+template <typename T>
+int32_t embSkipLayerNormHFace(cudaStream_t stream,
+                              int32_t ld,
+                              int32_t B,
+                              int32_t S,
+                              int32_t** inputIds,
+                              int32_t const nbLookupTables,
+                              float const* beta,
+                              float const* gamma,
+                              T** mIdsEmbDev,
+                              int32_t* IdsSize,
+                              T* output) {
+  constexpr int32_t tpb = 256;
+  dim3 const grid(S, B, 1);
+  dim3 const block(tpb, 1, 1);
+  size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
+  embLayerNormKernelHFace<T, tpb><<<grid, block, cache_size, stream>>>(
+      ld, inputIds, nbLookupTables, beta, gamma, mIdsEmbDev, IdsSize, output);
+  return cudaPeekAtLastError();
+}
+
+template int32_t embSkipLayerNormHFace<float>(cudaStream_t,
+                                              int32_t,
+                                              int32_t,
+                                              int32_t,
+                                              int32_t**,
+                                              int32_t const,
+                                              float const*,
+                                              float const*,
+                                              float**,
+                                              int32_t*,
+                                              float*);
+
+template int32_t embSkipLayerNormHFace<half>(cudaStream_t,
+                                             int32_t,
+                                             int32_t,
+                                             int32_t,
+                                             int32_t**,
+                                             int32_t const,
+                                             float const*,
+                                             float const*,
+                                             half**,
+                                             int32_t*,
+                                             half*);
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
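Note (illustrative, not part of the patch): a host-side sketch of how the launcher might be called. Every name here is invented, and all arguments are assumed to already reside in device memory; in particular, `devInputIds` must be a device-resident array of device pointers whose slot 0 holds the cumulative-seqlen prefix sums the kernel reads as `cuSeqlens`:

```cpp
// Hypothetical host-side launch sketch for embSkipLayerNormHFace<float>.
#include <cassert>
#include <cuda_runtime.h>

void launchHFaceExample(cudaStream_t stream,
                        int32_t hiddenSize,    // ld
                        int32_t batch,         // B
                        int32_t maxSeqLen,     // S
                        int32_t** devInputIds, // device array of device ptrs
                        int32_t nbLookupTables,
                        const float* devBeta,
                        const float* devGamma,
                        float** devEmbTables,  // device array of table ptrs
                        int32_t* devIdsSize,
                        float* devOutput) {
  // One CTA per (sequence position, batch) pair; CTAs past the end of a
  // sequence exit early inside the kernel via the cuSeqlens check.
  int32_t status =
      paddle::inference::tensorrt::plugin::embSkipLayerNormHFace<float>(
          stream, hiddenSize, batch, maxSeqLen, devInputIds, nbLookupTables,
          devBeta, devGamma, devEmbTables, devIdsSize, devOutput);
  assert(status == 0);  // 0 == cudaSuccess
}
```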
diff --git a/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu b/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d33a3772139b0605c2c42189e339ba4e188110ba
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu
@@ -0,0 +1,158 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+// AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+#include "NvInfer.h"
+#include "common/bertCommon.h"
+#include "common/common.cuh"
+#include "common/plugin.h"
+#include "common/serialize.h"
+// #include
+// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+template <typename T, unsigned TPB>
+__global__ void embLayerNormKernelMTron(int32_t ld,
+                                        int32_t** inputIds,
+                                        int32_t const nbLookupTables,
+                                        float const* beta,
+                                        float const* gamma,
+                                        T** mIdsEmbDev,
+                                        int32_t* IdsSize,
+                                        T* output,
+                                        T* skip) {
+  cub::Sum pairSum;
+  int32_t const s = blockIdx.x;
+  int32_t const b = blockIdx.y;
+  int32_t* cuSeqlens = inputIds[0];
+  int32_t const sumS = cuSeqlens[b];
+  int32_t const s_b = cuSeqlens[b + 1] - sumS;
+  if (s >= s_b) {
+    return;  // This CTA has nothing to do
+  }
+  T const rld = T(1.f) / T(ld);
+  int32_t const seqPos = sumS + s;
+  extern __shared__ int32_t word_id[];
+
+  if (threadIdx.x == 0) {
+    for (int i = 1; i < nbLookupTables; ++i) {
+      if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
+          static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
+        printf(
+            "Error (embLayerNormVarSeqlenPlugin): ID out of range of the "
+            "lookup table: ID < 0 or ID >= max ");
+        return;
+      } else {
+        word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
+      }
+    }
+  }
+  __syncthreads();
+
+  // 2. load pos/tok/word embeddings and add them together
+  // offset into embeddings is given by wordId * hidden_size
+  int32_t const poffset = blockIdx.x * ld;
+  int32_t const outOffset = seqPos * ld;
+  // the output offset is given by b * (S*hidden_size) + s * hidden_size
+  kvp<T> threadData(0, 0);
+
+  for (int32_t it = threadIdx.x; it < ld; it += TPB) {
+    T p(mIdsEmbDev[0][poffset + it]);  // pos id
+    T val = p;
+    for (int i = 1; i < nbLookupTables; ++i) {
+      int32_t const offset = word_id[i - 1] * ld;
+      val += mIdsEmbDev[i][offset + it];
+    }
+    output[outOffset + it] = val;
+    skip[outOffset + it] = val;
+
+    T const rldval = rld * val;
+    threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
+  }
+
+  // 3. layer norm on the sum
+  layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
+}
+
+template <typename T>
+int32_t embSkipLayerNormMTron(cudaStream_t stream,
+                              int32_t ld,
+                              int32_t B,
+                              int32_t S,
+                              int32_t** inputIds,
+                              int32_t const nbLookupTables,
+                              float const* beta,
+                              float const* gamma,
+                              T** mIdsEmbDev,
+                              int32_t* IdsSize,
+                              T* output,
+                              T* skip) {
+  constexpr int32_t tpb = 256;
+  dim3 const grid(S, B, 1);
+  dim3 const block(tpb, 1, 1);
+  size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
+  embLayerNormKernelMTron<T, tpb>
+      <<<grid, block, cache_size, stream>>>(ld,
+                                            inputIds,
+                                            nbLookupTables,
+                                            beta,
+                                            gamma,
+                                            mIdsEmbDev,
+                                            IdsSize,
+                                            output,
+                                            skip);
+  return cudaPeekAtLastError();
+}
+
+template int32_t embSkipLayerNormMTron<float>(cudaStream_t,
+                                              int32_t,
+                                              int32_t,
+                                              int32_t,
+                                              int32_t**,
+                                              int32_t const,
+                                              float const*,
+                                              float const*,
+                                              float**,
+                                              int32_t*,
+                                              float*,
+                                              float*);
+
+template int32_t embSkipLayerNormMTron<half>(cudaStream_t,
+                                             int32_t,
+                                             int32_t,
+                                             int32_t,
+                                             int32_t**,
+                                             int32_t const,
+                                             float const*,
+                                             float const*,
+                                             half**,
+                                             int32_t*,
+                                             half*,
+                                             half*);
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
index d54ff66625f3345f9ae4adb6b85c6135893ede21..aa533454fdfeff7e50783dc028af039dab3b3f11 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
@@ -135,7 +135,12 @@ static void trt_ernie(bool with_fp16, std::vector<float> result) {
   if (with_fp16) {
     precision = AnalysisConfig::Precision::kHalf;
   }
+
+#if defined _WIN32
+#else
   config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
+#endif
+
   config.SetTRTDynamicShapeInfo(
       min_input_shape, max_input_shape, opt_input_shape);
   AnalysisConfig* config_deser = new AnalysisConfig(config);
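Note (illustrative, not part of the patch): the serialize.h helpers introduced above are what a plugin's `getSerializationSize()`/`serialize()`/constructor pair would build on. A minimal round-trip sketch with invented field names:

```cpp
// Hypothetical round-trip through the serialize.h helpers.
#include <cassert>
#include <cstdint>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/common/serialize.h"

void serializeRoundTripExample() {
  int32_t nbLookupTables = 3;
  std::vector<int32_t> idsSize = {30522, 2, 512};

  // Size the buffer exactly as getSerializationSize() would.
  size_t total = serialized_size(nbLookupTables) + serialized_size(idsSize);
  std::vector<char> storage(total);

  // Write: serialize_value advances the cursor past each field.
  void* cursor = storage.data();
  serialize_value(&cursor, nbLookupTables);
  serialize_value(&cursor, idsSize);

  // Read back, tracking the remaining byte budget as the deserializing
  // plugin constructor would.
  const void* rd = storage.data();
  size_t remaining = total;
  int32_t tables = 0;
  std::vector<int32_t> sizes;
  deserialize_value(&rd, &remaining, &tables);
  deserialize_value(&rd, &remaining, &sizes);
  assert(tables == nbLookupTables && sizes == idsSize && remaining == 0);
}
```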