Unverified commit b7a1ae22, authored by ccrrong, committed by GitHub

add layer_norm trt fp16 support (#45043)

* add fp16 support

* update

* update half

* code format

* fix unittest

* fix rocm compile error

* code format

* code format

* fix rocm compile error

* fix rocm compile error
Parent dc31d2aa
@@ -60,6 +60,8 @@ class LayerNormOpConverter : public OpConverter {
// the shape of mean and variance will be determined in configurePlugin.
std::vector<int64_t> mean_shape{1};
std::vector<int64_t> variance_shape{1};
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::LayerNormPluginDynamic* plugin =
new plugin::LayerNormPluginDynamic(
static_cast<const float*>(bias_weight.get().values),
@@ -69,7 +71,8 @@ class LayerNormOpConverter : public OpConverter {
begin_norm_axis,
eps,
mean_shape,
variance_shape);
variance_shape,
with_fp16);
layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
} else {
int statis_num = 1;
@@ -78,6 +81,8 @@ class LayerNormOpConverter : public OpConverter {
}
std::vector<int64_t> mean_shape{statis_num};
std::vector<int64_t> variance_shape{statis_num};
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
static_cast<const float*>(bias_weight.get().values),
bias_weight.get().count,
@@ -86,7 +91,8 @@ class LayerNormOpConverter : public OpConverter {
begin_norm_axis,
eps,
mean_shape,
variance_shape);
variance_shape,
with_fp16);
layernorm_layer = engine_->AddPlugin(
&X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
}
......
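The with_fp16 flag computed in the converter above is true only when the TensorRT engine was built with half precision and plugin FP16 was not explicitly disabled. As a rough, hedged sketch of how a user reaches that state with the Paddle Inference C++ API (model paths, workspace size, and batch sizes below are placeholders, not taken from this commit):

#include <memory>
#include "paddle_inference_api.h"

// Sketch only: enable GPU + TensorRT with half precision so that
// engine_->WithFp16() is true in the converter above.
std::shared_ptr<paddle_infer::Predictor> BuildFp16Predictor() {
  paddle_infer::Config config("./model.pdmodel", "./model.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/512, /*device_id=*/0);
  config.EnableTensorRtEngine(/*workspace_size=*/1 << 30,
                              /*max_batch_size=*/1,
                              /*min_subgraph_size=*/3,
                              paddle_infer::PrecisionType::kHalf,
                              /*use_static=*/false,
                              /*use_calib_mode=*/false);
  // disable_trt_plugin_fp16 defaults to false; when dynamic shapes are
  // configured it can be flipped through Config::SetTRTDynamicShapeInfo.
  return paddle_infer::CreatePredictor(config);
}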
@@ -26,7 +26,30 @@ namespace inference {
namespace tensorrt {
namespace plugin {
int LayerNormPlugin::initialize() TRT_NOEXCEPT { return 0; }
int LayerNormPlugin::initialize() TRT_NOEXCEPT {
cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size());
cudaMemcpy(bias_gpu_,
bias_.data(),
bias_.size() * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size());
cudaMemcpy(scale_gpu_,
scale_.data(),
scale_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
void LayerNormPlugin::terminate() TRT_NOEXCEPT {
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
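With this change the scale/bias weights are copied to the GPU once in initialize() and released in terminate(), rather than being re-uploaded on every enqueue(). A standalone sketch of that lifecycle with explicit error handling follows; the helper names are illustrative only and not part of the plugin:

#include <cuda_runtime.h>
#include <vector>

// Illustrative helpers; the plugin above inlines this logic.
static float* AllocAndCopyToDevice(const std::vector<float>& host) {
  float* dev = nullptr;
  if (cudaMalloc(&dev, host.size() * sizeof(float)) != cudaSuccess) {
    return nullptr;  // allocation failed; caller should treat init as failed
  }
  cudaMemcpy(dev, host.data(), host.size() * sizeof(float),
             cudaMemcpyHostToDevice);
  return dev;
}

static void FreeDevice(float** dev) {
  if (dev != nullptr && *dev != nullptr) {
    cudaFree(*dev);
    *dev = nullptr;
  }
}

Every enqueue() call can then read bias_gpu_/scale_gpu_ directly, which removes the two per-run cudaMemcpyAsync uploads the old code issued.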
nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT {
@@ -37,6 +60,18 @@ nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
return output_dims;
}
bool LayerNormPlugin::supportsFormat(
nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT {
if (with_fp16_) {
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kLINEAR));
} else {
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kLINEAR));
}
}
int LayerNormPlugin::enqueue(int batch_size,
const void *const *inputs,
#if IS_TRT_VERSION_LT(8000)
@@ -48,8 +83,6 @@ int LayerNormPlugin::enqueue(int batch_size,
#endif
cudaStream_t stream) TRT_NOEXCEPT {
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
float *output = reinterpret_cast<float *const *>(outputs)[0];
int begin_norm_axis = begin_norm_axis_;
float eps = eps_;
@@ -92,42 +125,76 @@ int LayerNormPlugin::enqueue(int batch_size,
feature_size,
bias_.size()));
scale_t.Resize(phi::make_ddim({feature_size}));
bias_t.Resize(phi::make_ddim({feature_size}));
mean_t.Resize(phi::make_ddim({batched_mean_shape}));
variance_t.Resize(phi::make_ddim({batched_variance_shape}));
int device_id;
cudaGetDevice(&device_id);
float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
mean_t.Resize(phi::make_ddim({batched_mean_shape}));
variance_t.Resize(phi::make_ddim({batched_variance_shape}));
float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *variance_d =
variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
cudaMemcpyAsync(scale_d,
scale_.data(),
sizeof(float) * feature_size,
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(bias_d,
bias_.data(),
sizeof(float) * feature_size,
cudaMemcpyHostToDevice,
stream);
phi::LayerNormDirectCUDAFunctor<float> layer_norm;
layer_norm(stream,
input,
input_shape,
bias_d,
scale_d,
output,
mean_d,
variance_d,
begin_norm_axis,
eps);
auto input_type = getDataType();
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
const float *input = reinterpret_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
phi::LayerNormDirectCUDAFunctor<float, float> layer_norm;
layer_norm(stream,
input,
input_shape,
bias_gpu_,
scale_gpu_,
output,
mean_d,
variance_d,
begin_norm_axis,
eps);
} else if (input_type == nvinfer1::DataType::kHALF) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
const half *input = reinterpret_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
layer_norm(stream,
input,
input_shape,
bias_gpu_,
scale_gpu_,
output,
mean_d,
variance_d,
begin_norm_axis,
eps);
} else {
PADDLE_THROW(platform::errors::Fatal(
"The LayerNorm TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
int LayerNormPluginDynamic::initialize() TRT_NOEXCEPT {
cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size());
cudaMemcpy(bias_gpu_,
bias_.data(),
bias_.size() * sizeof(float),
cudaMemcpyHostToDevice);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size());
cudaMemcpy(scale_gpu_,
scale_.data(),
scale_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
void LayerNormPluginDynamic::terminate() TRT_NOEXCEPT {
if (bias_gpu_) {
cudaFree(bias_gpu_);
bias_gpu_ = nullptr;
}
if (scale_gpu_) {
cudaFree(scale_gpu_);
scale_gpu_ = nullptr;
}
}
nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions(
int output_index,
const nvinfer1::DimsExprs *inputDims,
@@ -154,9 +221,14 @@ bool LayerNormPluginDynamic::supportsFormatCombination(
nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
// TODO(Shangzhizhou) FP16 support
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
if (with_fp16_) {
return ((in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::PluginFormat::kLINEAR));
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
@@ -187,6 +259,11 @@ nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
"The LayerNormPlugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
input_types[0] == nvinfer1::DataType::kHALF),
true,
platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
@@ -249,42 +326,40 @@ int LayerNormPluginDynamic::enqueue(
"but got feature_size:%d, bias's size:%d.",
feature_size,
bias_.size()));
int device_id;
cudaGetDevice(&device_id);
mean_t.Resize(phi::make_ddim(mean_shape_));
variance_t.Resize(phi::make_ddim(variance_shape_));
float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *variance_d =
variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
const float *input = reinterpret_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
scale_t.Resize(phi::make_ddim({feature_size}));
bias_t.Resize(phi::make_ddim({feature_size}));
mean_t.Resize(phi::make_ddim(mean_shape_));
variance_t.Resize(phi::make_ddim(variance_shape_));
float *scale_d =
scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *variance_d =
variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
cudaMemcpyAsync(scale_d,
scale_.data(),
sizeof(float) * feature_size,
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(bias_d,
bias_.data(),
sizeof(float) * feature_size,
cudaMemcpyHostToDevice,
stream);
phi::LayerNormDirectCUDAFunctor<float> layer_norm;
phi::LayerNormDirectCUDAFunctor<float, float> layer_norm;
layer_norm(stream,
input,
input_shape,
bias_gpu_,
scale_gpu_,
output,
mean_d,
variance_d,
begin_norm_axis,
eps);
} else if (input_type == nvinfer1::DataType::kHALF) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
const half *input = reinterpret_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
layer_norm(stream,
input,
input_shape,
bias_d,
scale_d,
bias_gpu_,
scale_gpu_,
output,
mean_d,
variance_d,
@@ -292,7 +367,7 @@ int LayerNormPluginDynamic::enqueue(
eps);
} else {
PADDLE_THROW(platform::errors::Fatal(
"The LayerNorm TRT Plugin's input type should be float."));
"The LayerNorm TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
......
@@ -31,8 +31,6 @@ namespace plugin {
class LayerNormPlugin : public PluginTensorRT {
std::vector<float> bias_;
std::vector<float> scale_;
framework::Tensor scale_t;
framework::Tensor bias_t;
framework::Tensor mean_t;
framework::Tensor variance_t;
int begin_norm_axis_;
@@ -40,12 +38,16 @@ class LayerNormPlugin : public PluginTensorRT {
std::vector<int64_t> mean_shape_;
std::vector<int64_t> variance_shape_;
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
public:
size_t getSerializationSize() const TRT_NOEXCEPT override {
return getBaseSerializationSize() + SerializedSize(bias_) +
SerializedSize(scale_) + SerializedSize(begin_norm_axis_) +
SerializedSize(eps_) + SerializedSize(mean_shape_) +
SerializedSize(variance_shape_);
SerializedSize(variance_shape_) + SerializedSize(with_fp16_);
}
// TRT will call this func when we need to serialize the configuration of
@@ -59,6 +61,7 @@ class LayerNormPlugin : public PluginTensorRT {
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, mean_shape_);
SerializeValue(&buffer, variance_shape_);
SerializeValue(&buffer, with_fp16_);
}
LayerNormPlugin(const float* bias,
@@ -68,11 +71,13 @@ class LayerNormPlugin : public PluginTensorRT {
int begin_norm_axis,
float eps,
std::vector<int64_t> mean_shape,
std::vector<int64_t> variance_shape)
std::vector<int64_t> variance_shape,
bool with_fp16)
: begin_norm_axis_(begin_norm_axis),
eps_(eps),
mean_shape_(mean_shape),
variance_shape_(variance_shape) {
with_fp16_ = with_fp16;
bias_.resize(bias_num);
scale_.resize(scale_num);
std::copy(bias, bias + bias_num, bias_.data());
@@ -89,24 +94,33 @@ class LayerNormPlugin : public PluginTensorRT {
DeserializeValue(&serialData, &serialLength, &eps_);
DeserializeValue(&serialData, &serialLength, &mean_shape_);
DeserializeValue(&serialData, &serialLength, &variance_shape_);
DeserializeValue(&serialData, &serialLength, &with_fp16_);
}
~LayerNormPlugin() {}
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
LayerNormPlugin* clone() const TRT_NOEXCEPT override {
return new LayerNormPlugin(bias_.data(),
bias_.size(),
scale_.data(),
scale_.size(),
begin_norm_axis_,
eps_,
mean_shape_,
variance_shape_);
auto ptr = new LayerNormPlugin(bias_.data(),
bias_.size(),
scale_.data(),
scale_.size(),
begin_norm_axis_,
eps_,
mean_shape_,
variance_shape_,
with_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "layernorm_plugin";
}
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* inputs,
@@ -150,11 +164,13 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
int begin_norm_axis,
float eps,
std::vector<int64_t> mean_shape,
std::vector<int64_t> variance_shape)
std::vector<int64_t> variance_shape,
bool with_fp16)
: begin_norm_axis_(begin_norm_axis),
eps_(eps),
mean_shape_(mean_shape),
variance_shape_(variance_shape) {
with_fp16_ = with_fp16;
bias_.resize(bias_num);
scale_.resize(scale_num);
std::copy(bias, bias + bias_num, bias_.data());
@@ -168,28 +184,35 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
DeserializeValue(&serialData, &serialLength, &eps_);
DeserializeValue(&serialData, &serialLength, &mean_shape_);
DeserializeValue(&serialData, &serialLength, &variance_shape_);
DeserializeValue(&serialData, &serialLength, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
return new LayerNormPluginDynamic(bias_.data(),
bias_.size(),
scale_.data(),
scale_.size(),
begin_norm_axis_,
eps_,
mean_shape_,
variance_shape_);
auto ptr = new LayerNormPluginDynamic(bias_.data(),
bias_.size(),
scale_.data(),
scale_.size(),
begin_norm_axis_,
eps_,
mean_shape_,
variance_shape_,
with_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "layernorm_plugin_dynamic";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT override { return 0; }
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(begin_norm_axis_) + SerializedSize(eps_) +
SerializedSize(mean_shape_) + SerializedSize(variance_shape_);
SerializedSize(mean_shape_) + SerializedSize(variance_shape_) +
SerializedSize(with_fp16_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
@@ -199,6 +222,7 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, mean_shape_);
SerializeValue(&buffer, variance_shape_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs getOutputDimensions(int output_index,
@@ -240,14 +264,15 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
private:
std::vector<float> bias_;
std::vector<float> scale_;
framework::Tensor scale_t;
framework::Tensor bias_t;
framework::Tensor mean_t;
framework::Tensor variance_t;
int begin_norm_axis_;
float eps_;
std::vector<int64_t> mean_shape_;
std::vector<int64_t> variance_shape_;
// data on devices
float* bias_gpu_{nullptr};
float* scale_gpu_{nullptr};
};
class LayerNormPluginDynamicCreator : public TensorRTPluginCreator {
......
@@ -379,7 +379,8 @@ __global__ void LayerNormForward(
var_val = BlockReduceSum<U>(var_val, shared_var);
if (threadIdx.x == 0) {
auto scale = static_cast<float>(1.) / static_cast<float>(feature_size);
auto scale = static_cast<U>(static_cast<float>(1.) /
static_cast<float>(feature_size));
auto tmp = mean_val * scale;
mean[blockIdx.x] = mean_share = static_cast<U>(tmp);
var_share = static_cast<U>(var_val * scale - mean_share * mean_share);
......
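For reference, with H = feature_size and row index i over the flattened [batch, H] view, the kernel above computes the standard LayerNorm statistics and output (γ is scale, β is bias). Keeping the U template parameter as float means μ and σ² are accumulated in single precision even when x and y are half:

\mu_i = \frac{1}{H}\sum_{j=1}^{H} x_{ij}, \qquad
\sigma_i^2 = \frac{1}{H}\sum_{j=1}^{H} x_{ij}^2 - \mu_i^2, \qquad
y_{ij} = \gamma_j \,\frac{x_{ij} - \mu_i}{\sqrt{\sigma_i^2 + \varepsilon}} + \beta_j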
@@ -21,24 +21,24 @@
namespace phi {
template <typename T>
void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
const T *input,
std::vector<int> input_shape,
const T *bias,
const T *scale,
T *output,
T *mean,
T *variance,
int begin_norm_axis,
float eps) {
template <typename T, typename U>
void LayerNormDirectCUDAFunctor<T, U>::operator()(gpuStream_t stream,
const T *input,
std::vector<int> input_shape,
const U *bias,
const U *scale,
T *output,
U *mean,
U *variance,
int begin_norm_axis,
float eps) {
const auto x_dims = phi::make_ddim(input_shape);
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
paddle::operators::LayerNormForward<T, T, kBlockDim>
paddle::operators::LayerNormForward<T, U, kBlockDim>
<<<batch_size, kBlockDim, 0, stream>>>(
input, scale, bias, output, mean, variance, eps, feature_size));
default:
@@ -49,7 +49,10 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
}
}
template class LayerNormDirectCUDAFunctor<float>;
template class LayerNormDirectCUDAFunctor<float, float>;
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
template class LayerNormDirectCUDAFunctor<half, float>;
#endif
template <typename T, typename Context>
void LayerNormKernel(const Context &dev_ctx,
......
@@ -32,17 +32,17 @@ void LayerNormKernel(const Context& ctx,
DenseTensor* variance);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
template <typename T, typename U>
class LayerNormDirectCUDAFunctor {
public:
void operator()(gpuStream_t stream,
const T* input,
std::vector<int> input_shape,
const T* bias,
const T* scale,
const U* bias,
const U* scale,
T* output,
T* mean,
T* variance,
U* mean,
U* variance,
int begin_norm_axis,
float eps);
};
......
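With the functor now templated on <T, U>, the activation type T and the statistics/weight type U may differ. A minimal call sketch matching how the plugin's FP16 branch uses it; all device pointers and sizes are assumed to be allocated and filled elsewhere, and the function name is hypothetical:

#include <cuda_fp16.h>
#include <vector>
#include "paddle/phi/kernels/layer_norm_kernel.h"

// Hypothetical call site: half activations, float scale/bias/statistics.
void RunLayerNormHalf(cudaStream_t stream,
                      const half* d_x, half* d_y,
                      const float* d_scale, const float* d_bias,
                      float* d_mean, float* d_var,
                      int batch, int hidden) {
  std::vector<int> input_shape{batch, hidden};
  phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
  // Argument order follows the header above:
  // input, input_shape, bias, scale, output, mean, variance, axis, eps.
  layer_norm(stream, d_x, input_shape, d_bias, d_scale, d_y,
             d_mean, d_var, /*begin_norm_axis=*/1, /*eps=*/1e-5f);
}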