Unverified commit 3441e5e8, authored by Wangzheee, committed by GitHub

[Paddle Inference] Support fusing n lookup_tables into embedding layernorm (1) (#46230)

* [Paddle Inference] Support fusing n lookup_tables into embedding layernorm (1): add supporting functions for the embedding plugins
Parent 264ad205
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <vector>
#include "NvInfer.h"
#include "NvInferRuntimeCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/plugin.h"
#define TRT_UNUSED (void)
#define BERT_PRINT_DEBUG_MSG 0
#if BERT_PRINT_DEBUG_MSG
#define TRANSFORMER_DEBUG_MSG(msg) (gLogVerbose << (msg) << std::endl)
#define BERT_DEBUG_VALUE(key, value) (gLogVerbose << key << value << std::endl)
#else
#define TRANSFORMER_DEBUG_MSG(msg) TRT_UNUSED(msg)
#define BERT_DEBUG_VALUE(key, value) \
TRT_UNUSED(key); \
TRT_UNUSED(value)
#endif
using half = __half;
constexpr uint32_t BDIM = 1; // batch dimension
constexpr uint32_t SDIM = 0; // seq len dimension
constexpr uint32_t HDIM = 2; // hidden dimension
constexpr int32_t kSM_53 = 53;
constexpr int32_t kSM_70 = 70;
constexpr int32_t kSM_72 = 72;
constexpr int32_t kSM_75 = 75;
constexpr int32_t kSM_80 = 80;
constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_87 = 87;
constexpr size_t threadsPerCta128 = 2 * 2 * 32;
constexpr size_t threadsPerCta384 = 1 * 8 * 32;
// The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M
// dimension: (s + 16*warps_m - 1) / (16*warps_m);
constexpr size_t xmmasM128 = 4;
constexpr size_t xmmasM384 = 24;
// Packed mask size per batch. Layout is XMMAS_M * THREADS_PER_CTA.
constexpr size_t unfusedMaskSize = 1;
constexpr size_t packedMaskSize64 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize96 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize128 = xmmasM128 * threadsPerCta128;
constexpr size_t packedMaskSize384 = xmmasM384 * threadsPerCta384;
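// As a worked example (assuming, as in the fused attention kernels these
// constants are sized for, warps_m = 2 for S = 128 and warps_m = 1 for S = 384):
// xmmasM128 = (128 + 16*2 - 1) / (16*2) = 4 and
// xmmasM384 = (384 + 16*1 - 1) / (16*1) = 24, matching the values above.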
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
switch (t) {
case nvinfer1::DataType::kINT32:
return 4;
case nvinfer1::DataType::kFLOAT:
return 4;
case nvinfer1::DataType::kHALF:
return 2;
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kINT8:
return 1;
}
return 0;
}
inline int64_t getWeightsSize(const nvinfer1::Weights& w,
nvinfer1::DataType type) {
return w.count * getElementSize(type);
}
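// Serializes nbElem elements from device memory into the host buffer pointed
// to by *buffer, advancing *buffer past the copied bytes.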
template <typename T>
inline void serFromDev(char** buffer, const T* data, size_t nbElem) {
const size_t len = sizeof(T) * nbElem;
cudaMemcpy(
*buffer, static_cast<const void*>(data), len, cudaMemcpyDeviceToHost);
*buffer += len;
}
template <typename T>
struct CudaDeleter {
void operator()(T* buf) { cudaFree(buf); }
};
template <typename T>
using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
template <typename T>
using cuda_shared_ptr = std::shared_ptr<T>;
template <typename T>
void make_cuda_shared(cuda_shared_ptr<T>* ptr, void* cudaMem) {
ptr->reset(static_cast<T*>(cudaMem), CudaDeleter<T>());
}
struct WeightsWithOwnership : public nvinfer1::Weights {
WeightsWithOwnership() {
values = nullptr;
count = 0;
}
~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
WeightsWithOwnership(const WeightsWithOwnership&) = delete;
WeightsWithOwnership operator=(const WeightsWithOwnership&) = delete;
WeightsWithOwnership(const WeightsWithOwnership&&) = delete;
WeightsWithOwnership operator=(const WeightsWithOwnership&&) = delete;
void convertAndCopy(const nvinfer1::Weights& src, nvinfer1::DataType type) {
this->type = type;
this->count = src.count;
if (type == nvinfer1::DataType::kFLOAT) {
auto destBuf = new float[src.count];
this->values = destBuf;
if (src.type == nvinfer1::DataType::kFLOAT) {
TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Float Array(Host)");
std::copy_n(static_cast<const float*>(src.values), src.count, destBuf);
} else {
assert(src.type == nvinfer1::DataType::kHALF);
TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Float Array(Host)");
const auto s = static_cast<const half*>(src.values);
auto d = static_cast<float*>(const_cast<void*>(this->values));
for (auto it = 0; it < src.count; it++) {
d[it] = __half2float(s[it]);
}
}
} else if (type == nvinfer1::DataType::kHALF) {
auto destBuf = new half[src.count];
this->values = destBuf;
if (src.type == nvinfer1::DataType::kHALF) {
TRANSFORMER_DEBUG_MSG("Half Weights(Host) => Half Array(Host)");
std::copy_n(static_cast<const half*>(src.values), src.count, destBuf);
} else {
assert(src.type == nvinfer1::DataType::kFLOAT);
TRANSFORMER_DEBUG_MSG("Float Weights(Host) => Half Array(Host)");
const auto s = static_cast<const float*>(src.values);
auto d = static_cast<half*>(const_cast<void*>(this->values));
for (auto it = 0; it < src.count; it++) {
d[it] = __float2half(s[it]);
}
}
} else {
throw std::runtime_error("Unsupported DataType specified for plugin.");
}
}
void convertAndCopy(const char** srcBuf,
size_t count,
nvinfer1::DataType type) noexcept {
this->type = type;
this->count = count;
const auto nbBytes = getWeightsSize(*this, type);
auto destBuf = new char[nbBytes];
this->values = destBuf;
std::copy_n(*srcBuf, nbBytes, destBuf);
*srcBuf += nbBytes;
}
};
template <typename T>
inline void copyToDevice(WeightsWithOwnership* hostWeights,
size_t nbBytes,
cuda_unique_ptr<T>* cudaWeights) {
if (hostWeights->values) {
void* cudaMem{nullptr};
cudaMalloc(&cudaMem, nbBytes);
cudaMemcpy(cudaMem, hostWeights->values, nbBytes, cudaMemcpyHostToDevice);
cudaWeights->reset(static_cast<T*>(cudaMem));
}
}
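// Illustrative sketch (not part of this header): how a plugin might combine
// convertAndCopy and copyToDevice to take ownership of creator-supplied
// weights. `srcWeights` and `mEmbDev` are hypothetical names.
//
//   WeightsWithOwnership emb;
//   emb.convertAndCopy(srcWeights, nvinfer1::DataType::kHALF);    // host copy
//   cuda_unique_ptr<void> mEmbDev;
//   copyToDevice(&emb, getWeightsSize(emb, emb.type), &mEmbDev);  // to device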
inline nvinfer1::DataType fieldTypeToDataType(
const nvinfer1::PluginFieldType ftype) {
switch (ftype) {
case nvinfer1::PluginFieldType::kFLOAT32: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
return nvinfer1::DataType::kFLOAT;
}
case nvinfer1::PluginFieldType::kFLOAT16: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float16");
return nvinfer1::DataType::kHALF;
}
case nvinfer1::PluginFieldType::kINT32: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Int32");
return nvinfer1::DataType::kINT32;
}
case nvinfer1::PluginFieldType::kINT8: {
TRANSFORMER_DEBUG_MSG("PluginFieldType is Int8");
return nvinfer1::DataType::kINT8;
}
default:
TRANSFORMER_DEBUG_MSG("PluginFieldType is Float32");
return nvinfer1::DataType::kFLOAT;
}
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
#endif // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_BERTCOMMON_H_
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef COMMON_CUH
#define COMMON_CUH
#include "cublas_v2.h"
#include <cub/cub.cuh>
using kv_float = cub::KeyValuePair<float, float>;
using kv_half = cub::KeyValuePair<half, half>;
using kv_half2 = cub::KeyValuePair<half2, half2>;
template <typename T>
__device__ inline T rsqrt(const T& x);
template <>
__device__ inline float rsqrt(const float& x) {
return rsqrtf(x);
}
__device__ inline kv_float operator+(const kv_float& a, const kv_float& b) {
return kv_float(a.key + b.key, a.value + b.value);
}
// Half Operations
__device__ inline half2 __hadd2_with_fallback(const half2 a, const half2 b) {
#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
#else
float2 out {};
out.x = __half2float(a.x) + __half2float(b.x);
out.y = __half2float(a.y) + __half2float(b.y);
return __float22half2_rn(out);
#endif
}
#if __CUDA_ARCH__ < 530
template <typename T>
__device__ inline T operator+(const T& a, const T& b);
template <typename T>
__device__ inline T operator*(const T& a, const T& b);
template <>
__device__ inline half2 operator+(const half2& a, const half2& b) {
return __hadd2_with_fallback(a, b);
}
template <>
__device__ inline half2 operator*(const half2& a, const half2& b) {
float2 out {};
out.x = __half2float(a.x) * __half2float(b.x);
out.y = __half2float(a.y) * __half2float(b.y);
return __float22half2_rn(out);
}
template <typename T>
__device__ inline T operator+(const T& a, const T& b);
template <typename T>
__device__ inline T operator/(const T& a, const T& b);
template <typename T>
__device__ inline T& operator+=(T& a, const T& b);
template <typename T>
__device__ inline T operator-(const T& a, const T& b);
template <typename T>
__device__ inline T operator*(const T& a, const T& b);
template <>
__device__ inline half operator+(const half& a, const half& b) {
return __float2half(__half2float(a) + __half2float(b));
}
template <>
__device__ inline half& operator+=(half& a, const half& b) {
a = __float2half(__half2float(a) + __half2float(b));
return a;
}
template <>
__device__ inline half operator-(const half& a, const half& b) {
return __float2half(__half2float(a) - __half2float(b));
}
template <>
__device__ inline half operator*(const half& a, const half& b) {
return __float2half(__half2float(a) * __half2float(b));
}
template <>
__device__ inline half operator/(const half& a, const half& b) {
return __float2half(__half2float(a) / __half2float(b));
}
#endif
template <>
__device__ inline half rsqrt(const half& x) {
#if __CUDA_ARCH__ >= 530
return hrsqrt(x);
#else
return __float2half(rsqrt(__half2float(x)));
#endif
}
__device__ inline kv_half operator+(const kv_half& a, const kv_half& b) {
const half2 a2 = __halves2half2(a.key, a.value);
const half2 b2 = __halves2half2(b.key, b.value);
const half2 res = __hadd2_with_fallback(a2, b2);
return kv_half(res.x, res.y);
}
__device__ inline kv_half2 operator+(const kv_half2& a, const kv_half2& b) {
return kv_half2(__hadd2_with_fallback(a.key, b.key), __hadd2_with_fallback(a.value, b.value));
}
// Helper Functions
template <typename T>
using kvp = cub::KeyValuePair<T, T>;
template <typename T, typename R, typename P, int TPB>
__device__ inline void layerNorm(
const kvp<R>& threadData, const int ld, const int offset, const P* beta, const P* gamma, T* output) {
// Assuming threadData is already divided by ld
using BlockReduce = cub::BlockReduce<kvp<R>, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ R mu; // mean
__shared__ R rsigma; // 1 / std.dev.
const auto sumKV = BlockReduce(temp_storage).Reduce(threadData, cub::Sum());
if (threadIdx.x == 0) {
mu = sumKV.key;
rsigma = rsqrt(sumKV.value - mu * mu);
}
__syncthreads();
for (int i = threadIdx.x; i < ld; i += TPB) {
const int idx = offset + i;
const R val = output[idx];
const R g(gamma[i]);
const R b(beta[i]);
output[idx] = g * (val - mu) * rsigma + b;
}
}
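// Note on the reduction above: each thread contributes kvp(x / ld, x * x / ld),
// so after the block reduce sumKV.key approximates E[x] and sumKV.value
// approximates E[x^2]; rsigma then follows from Var[x] = E[x^2] - E[x]^2.
// (This variant adds no epsilon term before the rsqrt.)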
#endif // #ifndef COMMON_CUH
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
#define PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
#include <cuda_runtime.h>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include "NvInfer.h"
#include "NvInferPlugin.h"
typedef enum {
STATUS_SUCCESS = 0,
STATUS_FAILURE = 1,
STATUS_BAD_PARAM = 2,
STATUS_NOT_SUPPORTED = 3,
STATUS_NOT_INITIALIZED = 4
} pluginStatus_t;
namespace nvinfer1 {
class BasePlugin : public IPluginV2 {
protected:
void setPluginNamespace(const char* libNamespace) noexcept override {
mNamespace = libNamespace;
}
const char* getPluginNamespace() const noexcept override {
return mNamespace.c_str();
}
std::string mNamespace;
};
class BaseCreator : public IPluginCreator {
public:
void setPluginNamespace(const char* libNamespace) noexcept override {
mNamespace = libNamespace;
}
const char* getPluginNamespace() const noexcept override {
return mNamespace.c_str();
}
protected:
std::string mNamespace;
};
} // namespace nvinfer1
#endif // PADDLE_FLUID_INFERENCE_TENSORRT_PLUGIN_COMMON_PLUGIN_H_
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
#include <iostream>
using std::cerr;
using std::cout;
using std::endl;
template <typename T>
inline void serialize_value(void** buffer, T const& value);
template <typename T>
inline void deserialize_value(void const** buffer,
size_t* buffer_size,
T* value);
template <typename T, class Enable = void>
struct Serializer {};
template <typename T>
struct Serializer<T,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(T const&) { return sizeof(T); }
static void serialize(void** buffer, T const& value) {
::memcpy(*buffer, &value, sizeof(T));
reinterpret_cast<char*&>(*buffer) += sizeof(T);
}
static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
assert(*buffer_size >= sizeof(T));
::memcpy(value, *buffer, sizeof(T));
reinterpret_cast<char const*&>(*buffer) += sizeof(T);
*buffer_size -= sizeof(T);
}
};
template <>
struct Serializer<const char*> {
static size_t serialized_size(const char* value) { return strlen(value) + 1; }
static void serialize(void** buffer, const char* value) {
::snprintf(static_cast<char*>(*buffer), strlen(value) + 1, "%s", value);
reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
}
static void deserialize(void const** buffer,
size_t* buffer_size,
const char** value) {
*value = static_cast<char const*>(*buffer);
size_t data_size = strnlen(*value, *buffer_size) + 1;
assert(*buffer_size >= data_size);
reinterpret_cast<char const*&>(*buffer) += data_size;
*buffer_size -= data_size;
}
};
template <typename T>
struct Serializer<std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T);
}
static void serialize(void** buffer, std::vector<T> const& value) {
serialize_value(buffer, value.size());
size_t nbyte = value.size() * sizeof(T);
::memcpy(*buffer, value.data(), nbyte);
reinterpret_cast<char*&>(*buffer) += nbyte;
}
static void deserialize(void const** buffer,
size_t* buffer_size,
std::vector<T>* value) {
size_t size;
deserialize_value(buffer, buffer_size, &size);
value->resize(size);
size_t nbyte = value->size() * sizeof(T);
assert(*buffer_size >= nbyte);
::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte;
}
};
template <typename T>
inline size_t serialized_size(T const& value) {
return Serializer<T>::serialized_size(value);
}
template <typename T>
inline void serialize_value(void** buffer, T const& value) {
return Serializer<T>::serialize(buffer, value);
}
template <typename T>
inline void deserialize_value(void const** buffer,
size_t* buffer_size,
T* value) {
return Serializer<T>::deserialize(buffer, buffer_size, value);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>
#include "NvInfer.h"
#include "common/bertCommon.h"
#include "common/common.cuh"
#include "common/plugin.h"
#include "common/serialize.h"
// #include
// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T, unsigned TPB>
__global__ void embLayerNormKernelHFace(int32_t ld,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output) {
cub::Sum pairSum;
int32_t const s = blockIdx.x;
int32_t const b = blockIdx.y;
int32_t* cuSeqlens = inputIds[0];
int32_t const sumS = cuSeqlens[b];
int32_t const s_b = cuSeqlens[b + 1] - sumS;
if (s >= s_b) {
return; // This CTA has nothing to do
}
T const rld = T(1.f) / T(ld);
int32_t const seqPos = sumS + s;
extern __shared__ int32_t word_id[];
if (threadIdx.x == 0) {
for (int i = 1; i < nbLookupTables; ++i) {
if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
printf(
"Error (embLayerNormVarSeqlenPlugin): input ID is out of range for its "
"lookup table (ID < 0 or ID >= table size)\n");
return;
} else {
word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
}
}
}
__syncthreads();
// 2. load pos/tok/word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const poffset = blockIdx.x * ld;
int32_t const outOffset = seqPos * ld;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp<T> threadData(0, 0);
for (int32_t it = threadIdx.x; it < ld; it += TPB) {
T p(mIdsEmbDev[0][poffset + it]); // pos id
T val = p;
for (int i = 1; i < nbLookupTables; ++i) {
int32_t const offset = word_id[i - 1] * ld;
val += mIdsEmbDev[i][offset + it];
}
output[outOffset + it] = val;
T const rldval = rld * val;
threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
}
// 3. layer norm on the sum
layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
}
template <typename T>
int32_t embSkipLayerNormHFace(cudaStream_t stream,
int32_t ld,
int32_t B,
int32_t S,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output) {
constexpr int32_t tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
embLayerNormKernelHFace<T, tpb><<<grid, block, cache_size, stream>>>(
ld, inputIds, nbLookupTables, beta, gamma, mIdsEmbDev, IdsSize, output);
return cudaPeekAtLastError();
}
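// Launch geometry note: one block per (sequence position, batch) pair, i.e.
// grid = (S, B); inputIds[0] holds the cumulative sequence lengths, so blocks
// beyond a sample's real length exit early. The dynamic shared memory holds
// the (nbLookupTables - 1) looked-up ids shared by all threads of the block.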
template int32_t embSkipLayerNormHFace<float>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
float**,
int32_t*,
float*);
template int32_t embSkipLayerNormHFace<half>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
half**,
int32_t*,
half*);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
// AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>
#include "NvInfer.h"
#include "common/bertCommon.h"
#include "common/common.cuh"
#include "common/plugin.h"
#include "common/serialize.h"
// #include
// "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T, unsigned TPB>
__global__ void embLayerNormKernelMTron(int32_t ld,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output,
T* skip) {
cub::Sum pairSum;
int32_t const s = blockIdx.x;
int32_t const b = blockIdx.y;
int32_t* cuSeqlens = inputIds[0];
int32_t const sumS = cuSeqlens[b];
int32_t const s_b = cuSeqlens[b + 1] - sumS;
if (s >= s_b) {
return; // This CTA has nothing to do
}
T const rld = T(1.f) / T(ld);
int32_t const seqPos = sumS + s;
extern __shared__ int32_t word_id[];
if (threadIdx.x == 0) {
for (int i = 1; i < nbLookupTables; ++i) {
if (static_cast<int32_t const*>(inputIds[i])[seqPos] < 0 ||
static_cast<int32_t const*>(inputIds[i])[seqPos] >= IdsSize[i]) {
printf(
"Error (embLayerNormVarSeqlenPlugin): input ID is out of range for its "
"lookup table (ID < 0 or ID >= table size)\n");
return;
} else {
word_id[i - 1] = static_cast<int32_t const*>(inputIds[i])[seqPos];
}
}
}
__syncthreads();
// 2. load pos/tok/word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const poffset = blockIdx.x * ld;
int32_t const outOffset = seqPos * ld;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp<T> threadData(0, 0);
for (int32_t it = threadIdx.x; it < ld; it += TPB) {
T p(mIdsEmbDev[0][poffset + it]); // pos id
T val = p;
for (int i = 1; i < nbLookupTables; ++i) {
int32_t const offset = word_id[i - 1] * ld;
val += mIdsEmbDev[i][offset + it];
}
output[outOffset + it] = val;
skip[outOffset + it] = val;
T const rldval = rld * val;
threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
}
// 3. layer norm on the sum
layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
}
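// This MTron variant matches embLayerNormKernelHFace above except that it also
// writes the pre-layernorm embedding sum to `skip`, presumably so a downstream
// skip connection can reuse it.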
template <typename T>
int32_t embSkipLayerNormMTron(cudaStream_t stream,
int32_t ld,
int32_t B,
int32_t S,
int32_t** inputIds,
int32_t const nbLookupTables,
float const* beta,
float const* gamma,
T** mIdsEmbDev,
int32_t* IdsSize,
T* output,
T* skip) {
constexpr int32_t tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1);
embLayerNormKernelMTron<T, tpb>
<<<grid, block, cache_size, stream>>>(ld,
inputIds,
nbLookupTables,
beta,
gamma,
mIdsEmbDev,
IdsSize,
output,
skip);
return cudaPeekAtLastError();
}
template int32_t embSkipLayerNormMTron<float>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
float**,
int32_t*,
float*,
float*);
template int32_t embSkipLayerNormMTron<half>(cudaStream_t,
int32_t,
int32_t,
int32_t,
int32_t**,
int32_t const,
float const*,
float const*,
half**,
int32_t*,
half*,
half*);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
@@ -135,7 +135,12 @@ static void trt_ernie(bool with_fp16, std::vector<float> result) {
  if (with_fp16) {
    precision = AnalysisConfig::Precision::kHalf;
  }
#if defined _WIN32
#else
  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
#endif
  config.SetTRTDynamicShapeInfo(
      min_input_shape, max_input_shape, opt_input_shape);
  AnalysisConfig* config_deser = new AnalysisConfig(config);
...