From b7a1ae226a0d8684d0f4b04ddcdd9155d5e6a9ec Mon Sep 17 00:00:00 2001
From: ccrrong <101700995+ccrrong@users.noreply.github.com>
Date: Wed, 21 Sep 2022 21:17:38 +0800
Subject: [PATCH] add layer_norm trt fp16 support (#45043)

* add fp16 support

* update

* update half

* code format

* fix unittest

* fix rocm compile error

* code format

* code format

* fix rocm compile error

* fix rocm compile error
---
 .../tensorrt/convert/layer_norm_op.cc         |  10 +-
 .../tensorrt/plugin/layer_norm_op_plugin.cu   | 197 ++++++++++++------
 .../tensorrt/plugin/layer_norm_op_plugin.h    |  75 ++++---
 paddle/fluid/operators/layer_norm_kernel.cu.h |   3 +-
 paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  29 +--
 paddle/phi/kernels/layer_norm_kernel.h        |  10 +-
 6 files changed, 217 insertions(+), 107 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index 0eed1a4f5e7..b6f34a8549d 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -60,6 +60,8 @@ class LayerNormOpConverter : public OpConverter {
     // the shape of mean and variance will be determine in configuPlugin.
     std::vector<int64_t> mean_shape{1};
     std::vector<int64_t> variance_shape{1};
+    bool with_fp16 =
+        engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
     plugin::LayerNormPluginDynamic* plugin =
         new plugin::LayerNormPluginDynamic(
            static_cast<const float*>(bias_weight.get().values),
@@ -69,7 +71,8 @@ class LayerNormOpConverter : public OpConverter {
             begin_norm_axis,
             eps,
             mean_shape,
-            variance_shape);
+            variance_shape,
+            with_fp16);
     layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
   } else {
     int statis_num = 1;
@@ -78,6 +81,8 @@ class LayerNormOpConverter : public OpConverter {
     }
     std::vector<int64_t> mean_shape{statis_num};
     std::vector<int64_t> variance_shape{statis_num};
+    bool with_fp16 =
+        engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
     plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
         static_cast<const float*>(bias_weight.get().values),
         bias_weight.get().count,
@@ -86,7 +91,8 @@ class LayerNormOpConverter : public OpConverter {
         begin_norm_axis,
         eps,
         mean_shape,
-        variance_shape);
+        variance_shape,
+        with_fp16);
     layernorm_layer = engine_->AddPlugin(
         &X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
   }
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index da4ebdc6cb6..09a93d661bd 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -26,7 +26,30 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-int LayerNormPlugin::initialize() TRT_NOEXCEPT { return 0; }
+int LayerNormPlugin::initialize() TRT_NOEXCEPT {
+  cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size());
+  cudaMemcpy(bias_gpu_,
+             bias_.data(),
+             bias_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size());
+  cudaMemcpy(scale_gpu_,
+             scale_.data(),
+             scale_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  return 0;
+}
+
+void LayerNormPlugin::terminate() TRT_NOEXCEPT {
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
 
 nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT {
@@ -37,6 +60,18 @@ nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
   return output_dims;
 }
 
+bool LayerNormPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT {
+  if (with_fp16_) {
+    return ((type == nvinfer1::DataType::kFLOAT ||
+             type == nvinfer1::DataType::kHALF) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  } else {
+    return ((type == nvinfer1::DataType::kFLOAT) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  }
+}
+
 int LayerNormPlugin::enqueue(int batch_size,
                              const void *const *inputs,
 #if IS_TRT_VERSION_LT(8000)
@@ -48,8 +83,6 @@ int LayerNormPlugin::enqueue(int batch_size,
 #endif
                              cudaStream_t stream) TRT_NOEXCEPT {
   const auto &input_dims = this->getInputDims(0);
-  const float *input = reinterpret_cast<const float *>(inputs[0]);
-  float *output = reinterpret_cast<float *const *>(outputs)[0];
   int begin_norm_axis = begin_norm_axis_;
   float eps = eps_;
 
@@ -92,42 +125,76 @@ int LayerNormPlugin::enqueue(int batch_size,
                         feature_size,
                         bias_.size()));
-  scale_t.Resize(phi::make_ddim({feature_size}));
-  bias_t.Resize(phi::make_ddim({feature_size}));
-  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
-  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
   int device_id;
   cudaGetDevice(&device_id);
-  float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
-  float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
+  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
+  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
   float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
   float *variance_d =
       variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
-  cudaMemcpyAsync(scale_d,
-                  scale_.data(),
-                  sizeof(float) * feature_size,
-                  cudaMemcpyHostToDevice,
-                  stream);
-  cudaMemcpyAsync(bias_d,
-                  bias_.data(),
-                  sizeof(float) * feature_size,
-                  cudaMemcpyHostToDevice,
-                  stream);
-
-  phi::LayerNormDirectCUDAFunctor<float> layer_norm;
-  layer_norm(stream,
-             input,
-             input_shape,
-             bias_d,
-             scale_d,
-             output,
-             mean_d,
-             variance_d,
-             begin_norm_axis,
-             eps);
+  auto input_type = getDataType();
+  if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
+    const float *input = reinterpret_cast<const float *>(inputs[0]);
+    float *output = static_cast<float *>(outputs[0]);
+    phi::LayerNormDirectCUDAFunctor<float, float> layer_norm;
+    layer_norm(stream,
+               input,
+               input_shape,
+               bias_gpu_,
+               scale_gpu_,
+               output,
+               mean_d,
+               variance_d,
+               begin_norm_axis,
+               eps);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
+    const half *input = reinterpret_cast<const half *>(inputs[0]);
+    half *output = static_cast<half *>(outputs[0]);
+    phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
+    layer_norm(stream,
+               input,
+               input_shape,
+               bias_gpu_,
+               scale_gpu_,
+               output,
+               mean_d,
+               variance_d,
+               begin_norm_axis,
+               eps);
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "The LayerNorm TRT Plugin's input type should be float or half."));
+  }
   return cudaGetLastError() != cudaSuccess;
 }
 
+int LayerNormPluginDynamic::initialize() TRT_NOEXCEPT {
+  cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size());
+  cudaMemcpy(bias_gpu_,
+             bias_.data(),
+             bias_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size());
+  cudaMemcpy(scale_gpu_,
+             scale_.data(),
+             scale_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  return 0;
+}
+
+void LayerNormPluginDynamic::terminate() TRT_NOEXCEPT {
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
+
 nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions(
     int output_index,
     const nvinfer1::DimsExprs *inputDims,
@@ -154,9 +221,14 @@ bool LayerNormPluginDynamic::supportsFormatCombination(
                         nb_inputs + nb_outputs));
   const nvinfer1::PluginTensorDesc &in = in_out[pos];
   if (pos == 0) {
-    // TODO(Shangzhizhou) FP16 support
-    return (in.type == nvinfer1::DataType::kFLOAT) &&
-           (in.format == nvinfer1::TensorFormat::kLINEAR);
+    if (with_fp16_) {
+      return ((in.type == nvinfer1::DataType::kFLOAT ||
+               in.type == nvinfer1::DataType::kHALF) &&
+              (in.format == nvinfer1::PluginFormat::kLINEAR));
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
   }
   const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
   // output
@@ -187,6 +259,11 @@ nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
                         "The LayerNormPlugin only has one input, so the "
                         "index value should be 0, but get %d.",
                         index));
+  PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
+                     input_types[0] == nvinfer1::DataType::kHALF),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be half or float"));
   return input_types[0];
 }
 
@@ -249,42 +326,40 @@ int LayerNormPluginDynamic::enqueue(
                         "but got feature_size:%d, bias's size:%d.",
                         feature_size,
                         bias_.size()));
+
   int device_id;
   cudaGetDevice(&device_id);
+  mean_t.Resize(phi::make_ddim(mean_shape_));
+  variance_t.Resize(phi::make_ddim(variance_shape_));
+  float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
+  float *variance_d =
+      variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
     const float *input = reinterpret_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
-    scale_t.Resize(phi::make_ddim({feature_size}));
-    bias_t.Resize(phi::make_ddim({feature_size}));
-    mean_t.Resize(phi::make_ddim(mean_shape_));
-    variance_t.Resize(phi::make_ddim(variance_shape_));
-
-    float *scale_d =
-        scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
-    float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
-    float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
-    float *variance_d =
-        variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
-
-    cudaMemcpyAsync(scale_d,
-                    scale_.data(),
-                    sizeof(float) * feature_size,
-                    cudaMemcpyHostToDevice,
-                    stream);
-    cudaMemcpyAsync(bias_d,
-                    bias_.data(),
-                    sizeof(float) * feature_size,
-                    cudaMemcpyHostToDevice,
-                    stream);
-
-    phi::LayerNormDirectCUDAFunctor<float> layer_norm;
+    phi::LayerNormDirectCUDAFunctor<float, float> layer_norm;
+    layer_norm(stream,
+               input,
+               input_shape,
+               bias_gpu_,
+               scale_gpu_,
+               output,
+               mean_d,
+               variance_d,
+               begin_norm_axis,
+               eps);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
+    const half *input = reinterpret_cast<const half *>(inputs[0]);
+    half *output = static_cast<half *>(outputs[0]);
+    phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
     layer_norm(stream,
                input,
                input_shape,
-               bias_d,
-               scale_d,
+               bias_gpu_,
+               scale_gpu_,
                output,
               mean_d,
               variance_d,
               begin_norm_axis,
@@ -292,7 +367,7 @@ int LayerNormPluginDynamic::enqueue(
                eps);
   } else {
     PADDLE_THROW(platform::errors::Fatal(
-        "The LayerNorm TRT Plugin's input type should be float."));
+        "The LayerNorm TRT Plugin's input type should be float or half."));
   }
   return cudaGetLastError() != cudaSuccess;
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
index a8ccabb3cff..014a1eca030 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
@@ -31,8 +31,6 @@ namespace plugin {
 class LayerNormPlugin : public PluginTensorRT {
   std::vector<float> bias_;
   std::vector<float> scale_;
-  framework::Tensor scale_t;
-  framework::Tensor bias_t;
   framework::Tensor mean_t;
   framework::Tensor variance_t;
   int begin_norm_axis_;
@@ -40,12 +38,16 @@ class LayerNormPlugin : public PluginTensorRT {
   std::vector<int64_t> mean_shape_;
   std::vector<int64_t> variance_shape_;
 
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
+
  public:
   size_t getSerializationSize() const TRT_NOEXCEPT override {
     return getBaseSerializationSize() + SerializedSize(bias_) +
            SerializedSize(scale_) + SerializedSize(begin_norm_axis_) +
            SerializedSize(eps_) + SerializedSize(mean_shape_) +
-           SerializedSize(variance_shape_);
+           SerializedSize(variance_shape_) + SerializedSize(with_fp16_);
   }
 
   // TRT will call this func when we need to serialize the configuration of
@@ -59,6 +61,7 @@ class LayerNormPlugin : public PluginTensorRT {
     SerializeValue(&buffer, eps_);
     SerializeValue(&buffer, mean_shape_);
     SerializeValue(&buffer, variance_shape_);
+    SerializeValue(&buffer, with_fp16_);
   }
 
   LayerNormPlugin(const float* bias,
@@ -68,11 +71,13 @@ class LayerNormPlugin : public PluginTensorRT {
                   int begin_norm_axis,
                   float eps,
                   std::vector<int64_t> mean_shape,
-                  std::vector<int64_t> variance_shape)
+                  std::vector<int64_t> variance_shape,
+                  bool with_fp16)
       : begin_norm_axis_(begin_norm_axis),
         eps_(eps),
         mean_shape_(mean_shape),
         variance_shape_(variance_shape) {
+    with_fp16_ = with_fp16;
     bias_.resize(bias_num);
     scale_.resize(scale_num);
     std::copy(bias, bias + bias_num, bias_.data());
@@ -89,24 +94,33 @@ class LayerNormPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &eps_);
     DeserializeValue(&serialData, &serialLength, &mean_shape_);
     DeserializeValue(&serialData, &serialLength, &variance_shape_);
+    DeserializeValue(&serialData, &serialLength, &with_fp16_);
   }
 
   ~LayerNormPlugin() {}
   int initialize() TRT_NOEXCEPT override;
+  void terminate() TRT_NOEXCEPT override;
 
   LayerNormPlugin* clone() const TRT_NOEXCEPT override {
-    return new LayerNormPlugin(bias_.data(),
-                               bias_.size(),
-                               scale_.data(),
-                               scale_.size(),
-                               begin_norm_axis_,
-                               eps_,
-                               mean_shape_,
-                               variance_shape_);
+    auto ptr = new LayerNormPlugin(bias_.data(),
+                                   bias_.size(),
+                                   scale_.data(),
+                                   scale_.size(),
+                                   begin_norm_axis_,
+                                   eps_,
+                                   mean_shape_,
+                                   variance_shape_,
+                                   with_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const TRT_NOEXCEPT override {
     return "layernorm_plugin";
   }
+  bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
+      const TRT_NOEXCEPT override;
+
   int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims* inputs,
@@ -150,11 +164,13 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
                          int begin_norm_axis,
                          float eps,
                          std::vector<int64_t> mean_shape,
-                         std::vector<int64_t> variance_shape)
+                         std::vector<int64_t> variance_shape,
+                         bool with_fp16)
       : begin_norm_axis_(begin_norm_axis),
        eps_(eps),
        mean_shape_(mean_shape),
        variance_shape_(variance_shape) {
+    with_fp16_ = with_fp16;
     bias_.resize(bias_num);
     scale_.resize(scale_num);
     std::copy(bias, bias + bias_num, bias_.data());
@@ -168,28 +184,35 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &eps_);
     DeserializeValue(&serialData, &serialLength, &mean_shape_);
     DeserializeValue(&serialData, &serialLength, &variance_shape_);
+    DeserializeValue(&serialData, &serialLength, &with_fp16_);
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
-    return new LayerNormPluginDynamic(bias_.data(),
-                                      bias_.size(),
-                                      scale_.data(),
-                                      scale_.size(),
-                                      begin_norm_axis_,
-                                      eps_,
-                                      mean_shape_,
-                                      variance_shape_);
+    auto ptr = new LayerNormPluginDynamic(bias_.data(),
+                                          bias_.size(),
+                                          scale_.data(),
+                                          scale_.size(),
+                                          begin_norm_axis_,
+                                          eps_,
+                                          mean_shape_,
+                                          variance_shape_,
+                                          with_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const TRT_NOEXCEPT override {
     return "layernorm_plugin_dynamic";
   }
   int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
-  int initialize() TRT_NOEXCEPT override { return 0; }
+  int initialize() TRT_NOEXCEPT override;
+  void terminate() TRT_NOEXCEPT override;
   size_t getSerializationSize() const TRT_NOEXCEPT override {
     return SerializedSize(bias_) + SerializedSize(scale_) +
            SerializedSize(begin_norm_axis_) + SerializedSize(eps_) +
-           SerializedSize(mean_shape_) + SerializedSize(variance_shape_);
+           SerializedSize(mean_shape_) + SerializedSize(variance_shape_) +
+           SerializedSize(with_fp16_);
   }
 
   void serialize(void* buffer) const TRT_NOEXCEPT override {
@@ -199,6 +222,7 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
     SerializeValue(&buffer, eps_);
     SerializeValue(&buffer, mean_shape_);
     SerializeValue(&buffer, variance_shape_);
+    SerializeValue(&buffer, with_fp16_);
   }
 
   nvinfer1::DimsExprs getOutputDimensions(int output_index,
@@ -240,14 +264,15 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
  private:
   std::vector<float> bias_;
   std::vector<float> scale_;
-  framework::Tensor scale_t;
-  framework::Tensor bias_t;
   framework::Tensor mean_t;
   framework::Tensor variance_t;
   int begin_norm_axis_;
   float eps_;
   std::vector<int64_t> mean_shape_;
   std::vector<int64_t> variance_shape_;
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
 };
 
 class LayerNormPluginDynamicCreator : public TensorRTPluginCreator {
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 899eae3efb4..af3d4cec397 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -379,7 +379,8 @@ __global__ void LayerNormForward(
   var_val = BlockReduceSum<U>(var_val, shared_var);
 
   if (threadIdx.x == 0) {
-    auto scale = static_cast<float>(1.) / static_cast<float>(feature_size);
+    auto scale = static_cast<U>(static_cast<float>(1.) /
+                                static_cast<float>(feature_size));
     auto tmp = mean_val * scale;
     mean[blockIdx.x] = mean_share = static_cast<U>(tmp);
     var_share = static_cast<U>(var_val * scale - mean_share * mean_share);
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 4922ba0cc27..f2ad08c6800 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -21,24 +21,24 @@
 
 namespace phi {
 
-template <typename T>
-void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                               const T *input,
-                                               std::vector<int> input_shape,
-                                               const T *bias,
-                                               const T *scale,
-                                               T *output,
-                                               T *mean,
-                                               T *variance,
-                                               int begin_norm_axis,
-                                               float eps) {
+template <typename T, typename U>
+void LayerNormDirectCUDAFunctor<T, U>::operator()(gpuStream_t stream,
+                                                  const T *input,
+                                                  std::vector<int> input_shape,
+                                                  const U *bias,
+                                                  const U *scale,
+                                                  T *output,
+                                                  U *mean,
+                                                  U *variance,
+                                                  int begin_norm_axis,
+                                                  float eps) {
   const auto x_dims = phi::make_ddim(input_shape);
   auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
   int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
   int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
   switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
     FIXED_BLOCK_DIM_CASE(
-        paddle::operators::LayerNormForward<T, T, kBlockDim>
+        paddle::operators::LayerNormForward<T, U, kBlockDim>
         <<<batch_size, kBlockDim, 0, stream>>>(
             input, scale, bias, output, mean, variance, eps, feature_size));
     default:
@@ -49,7 +49,10 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
   }
 }
 
-template class LayerNormDirectCUDAFunctor<float>;
+template class LayerNormDirectCUDAFunctor<float, float>;
+#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+template class LayerNormDirectCUDAFunctor<half, float>;
+#endif
 
 template <typename T, typename Context>
 void LayerNormKernel(const Context &dev_ctx,
diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h
index 26c04b61af9..28ffdfd4771 100644
--- a/paddle/phi/kernels/layer_norm_kernel.h
+++ b/paddle/phi/kernels/layer_norm_kernel.h
@@ -32,17 +32,17 @@ void LayerNormKernel(const Context& ctx,
                      DenseTensor* variance);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
+template <typename T, typename U>
 class LayerNormDirectCUDAFunctor {
  public:
   void operator()(gpuStream_t stream,
                   const T* input,
                   std::vector<int> input_shape,
-                  const T* bias,
-                  const T* scale,
+                  const U* bias,
+                  const U* scale,
                   T* output,
-                  T* mean,
-                  T* variance,
+                  U* mean,
+                  U* variance,
                   int begin_norm_axis,
                   float eps);
 };
-- 
GitLab
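
For reference, a minimal sketch of how the new half-precision instantiation, phi::LayerNormDirectCUDAFunctor<half, float>, can be driven outside the TensorRT plugin, mirroring the fp16 enqueue path above. This is not part of the patch: it assumes a CUDA translation unit that can include paddle/phi/kernels/layer_norm_kernel.h, and the wrapper name, buffer shapes, begin_norm_axis and eps values are illustrative only.

// Hedged sketch: calls phi::LayerNormDirectCUDAFunctor<half, float> the same
// way LayerNormPluginDynamic::enqueue does for kHALF inputs. All pointer
// arguments are assumed to be device buffers allocated by the caller.
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <vector>

#include "paddle/phi/kernels/layer_norm_kernel.h"

void RunLayerNormFp16(cudaStream_t stream,
                      const half *x,       // [batch, hidden] fp16 input
                      const float *bias,   // [hidden] fp32 beta
                      const float *scale,  // [hidden] fp32 gamma
                      half *y,             // [batch, hidden] fp16 output
                      float *mean,         // [batch] fp32 saved mean
                      float *variance,     // [batch] fp32 saved variance
                      int batch,
                      int hidden) {
  std::vector<int> input_shape{batch, hidden};
  phi::LayerNormDirectCUDAFunctor<half, float> layer_norm;
  // begin_norm_axis = 1 normalizes over the trailing `hidden` dimension of a
  // [batch, hidden] flattening; the eps value here is an arbitrary example.
  layer_norm(stream,
             x,
             input_shape,
             bias,
             scale,
             y,
             mean,
             variance,
             /*begin_norm_axis=*/1,
             /*eps=*/1e-5f);
}

As in the plugin, bias and scale stay in fp32 (the U template parameter) while only the activations are fp16, which is why initialize() uploads the weights to bias_gpu_/scale_gpu_ as float regardless of the precision selected at runtime.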