From 7584bb50962d60aaa684201bb5180a0589345bd5 Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Wed, 2 Dec 2020 14:50:48 +0800
Subject: [PATCH] Layer norm fp16 (#29169)

* add fp16 for layer_norm op

* revert layernorm api

* fix forward

* fix forward

* fix backward for layernorm with fp16

* fix unit test for layernorm with fp16

* fix with_mkldnn compile error for layernorm with fp16

* 1. revert to PADDLE_ENFORCE_NOT_NULL, 2. change static_cast to static_cast

* fix with_mkldnn compile error for layernorm with fp16

* fix with_mkldnn compile error for layernorm with fp16

Co-authored-by: zhiqiu
---
 paddle/fluid/operators/layer_norm_op.cc      |  35 ++-
 paddle/fluid/operators/layer_norm_op.cu      | 264 ++++++++++--------
 .../contrib/mixed_precision/fp16_lists.py    |   4 +-
 .../contrib/mixed_precision/fp16_utils.py    |   7 +-
 .../tests/unittests/test_layer_norm_op.py    |   5 +-
 python/paddle/nn/functional/norm.py          |  11 +-
 6 files changed, 203 insertions(+), 123 deletions(-)

diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 6f83a667a59..23de34bc6fa 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/layer_norm_op.h"
 #include
+#include
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -98,7 +99,26 @@ class LayerNormOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const {
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    // By default, the scale, bias, mean, and var tensors should all be
+    // float32 (for a float32 or float16 input tensor) or float64 (for a
+    // float64 input tensor).
+ auto ln_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + ln_param_type = framework::proto::VarType::FP64; + } + if (ctx.HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ln_param_type, ctx.Input("Scale")->type(), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + } + if (ctx.HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ln_param_type, ctx.Input("Bias")->type(), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + } + framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; @@ -110,9 +130,8 @@ class LayerNormOp : public framework::OperatorWithKernel { } #endif - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); } }; @@ -224,7 +243,13 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_NOT_NULL( t, platform::errors::NotFound("Y@GRAD of LayerNorm Op is not found.")); - return framework::OpKernelType(t->type(), ctx.GetPlace()); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), + layout, library); } }; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 30bafb5c13e..0d877fe2324 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -15,12 +15,22 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; +template +using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + inline static int GetDesiredBlockDim(int block_dim) { const int kMaxBlockDim = 512; return block_dim >= kMaxBlockDim @@ -97,9 +107,9 @@ struct PairForLayerNormAddFunctor { } }; -template -__global__ void LayerNormForward(const T *x, const T *scale, const T *bias, - T *y, T *mean, T *var, float epsilon, +template +__global__ void LayerNormForward(const T *x, const U *scale, const U *bias, + T *y, U *mean, U *var, float epsilon, int feature_size) { using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -111,7 +121,7 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias, double mean_val = 0; double var_val = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { - T tmp = x[i]; + U tmp = static_cast(x[i]); mean_val += tmp; var_val += (tmp * tmp); } @@ -120,36 +130,39 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias, PairForLayerNormAddFunctor()); if (threadIdx.x == 0) { auto tmp = pair.first_ / feature_size; - mean[blockIdx.x] = static_cast(tmp); - var[blockIdx.x] = static_cast(pair.second_ / feature_size - tmp * tmp); + mean[blockIdx.x] = static_cast(tmp); + var[blockIdx.x] = static_cast(pair.second_ / feature_size - tmp * tmp); } __syncthreads(); mean_val = mean[blockIdx.x]; - var_val = 
static_cast(real_sqrt(var[blockIdx.x] + epsilon)); + var_val = static_cast(real_sqrt(var[blockIdx.x]) + epsilon); // Step 2: Calculate y if (scale != nullptr) { if (bias != nullptr) { for (int i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j]; + y[i] = static_cast( + scale[j] * (static_cast(x[i]) - mean_val) / var_val + bias[j]); } } else { for (int i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = scale[j] * (x[i] - mean_val) / var_val; + y[i] = static_cast(scale[j] * (static_cast(x[i]) - mean_val) / + var_val); } } } else { // scale == nullptr if (bias != nullptr) { for (int i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = (x[i] - mean_val) / var_val + bias[j]; + y[i] = static_cast((static_cast(x[i]) - mean_val) / var_val + + bias[j]); } } else { for (int i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = (x[i] - mean_val) / var_val; + y[i] = static_cast((static_cast(x[i]) - mean_val) / var_val); } } } @@ -157,35 +170,37 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias, // Make sure that d_scale != nullptr && d_bias != nullptr // Since d_scale != nullptr, scale would not be nullptr -template +template __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, - T *d_scale, T *d_bias, T *d_x, - const T *mean, const T *var, - const T *scale, float epsilon, + U *d_scale, U *d_bias, T *d_x, + const U *mean, const U *var, + const U *scale, float epsilon, int batch_size, int feature_size, int col_offset) { - using BlockReduce = cub::BlockReduce, BlockDim>; + using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; int beg_idx = threadIdx.x * feature_size + (blockIdx.x + col_offset); int end_idx = batch_size * feature_size + (blockIdx.x + col_offset); int stride = BlockDim * feature_size; - T d_scale_partial = 0, d_bias_partial = 0; + U d_scale_partial = static_cast(0), d_bias_partial = static_cast(0); for (int i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; - auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); - d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; - d_bias_partial += d_y[i]; + auto var_val = real_sqrt(static_cast(var[row_idx]) + epsilon); + d_scale_partial += static_cast(d_y[i]) * + (static_cast(x[i]) - mean[row_idx]) / var_val; + d_bias_partial += static_cast(d_y[i]); if (HasDx) { - d_x[i] = d_y[i] * scale[blockIdx.x + col_offset] / var_val; + d_x[i] = static_cast(static_cast(d_y[i]) * + scale[blockIdx.x + col_offset] / var_val); } } auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), - PairForLayerNormAddFunctor()); + .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), + PairForLayerNormAddFunctor()); if (threadIdx.x == 0) { d_scale[blockIdx.x + col_offset] = pair.first_; @@ -196,32 +211,36 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, // Make sure that there is only one true expression: d_scale != nullptr // or d_bias != nullptr // Notice: scale may be nullptr -template +template __global__ void LayerNormBackwardGradientScaleOrBias( - const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean, - const T *var, const T *scale, float epsilon, int batch_size, + const T *x, const T *d_y, U *d_scale, U *d_bias, T *d_x, const U *mean, + const U *var, const U *scale, float 
epsilon, int batch_size, int feature_size, int col_offset) { - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int beg_idx = threadIdx.x * feature_size + blockIdx.x + col_offset; int end_idx = batch_size * feature_size + blockIdx.x + col_offset; int stride = BlockDim * feature_size; - T d_scale_or_d_bias_partial = 0; + U d_scale_or_d_bias_partial = static_cast(0); for (int i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; - auto var_val = static_cast(real_sqrt(var[row_idx] + epsilon)); + auto var_val = + static_cast(real_sqrt(static_cast(var[row_idx]) + epsilon)); if (HasDScale) { - d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val; + d_scale_or_d_bias_partial += static_cast(d_y[i]) * + (static_cast(x[i]) - mean[row_idx]) / + var_val; } else { // d_bias != nullptr - d_scale_or_d_bias_partial += d_y[i]; + d_scale_or_d_bias_partial += static_cast(d_y[i]); } if (HasDx) { if (scale != nullptr) { - d_x[i] = d_y[i] * scale[blockIdx.x + col_offset] / var_val; + d_x[i] = static_cast(static_cast(d_y[i]) * + scale[blockIdx.x + col_offset] / var_val); } else { - d_x[i] = d_y[i] / var_val; + d_x[i] = static_cast(static_cast(d_y[i]) / var_val); } } } @@ -238,120 +257,133 @@ __global__ void LayerNormBackwardGradientScaleOrBias( } } -template +template __global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, - const T *mean, - const T *var, + const U *mean, + const U *var, float epsilon, int feature_size) { - using BlockReduce = cub::BlockReduce, BlockDim>; + using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T d_x_reduce_tmp[2]; + __shared__ U d_x_reduce_tmp[2]; int beg_idx = blockIdx.x * feature_size + threadIdx.x; int end_idx = (blockIdx.x + 1) * feature_size; - T block_mean = mean[blockIdx.x]; - T block_var = var[blockIdx.x]; - T d_x_mean_partial = 0, d_x_var_partial = 0; + U block_mean = mean[blockIdx.x]; + U block_var = var[blockIdx.x]; + U d_x_mean_partial = static_cast(0), d_x_var_partial = static_cast(0); for (int i = beg_idx; i < end_idx; i += BlockDim) { - d_x_mean_partial += d_x[i]; - d_x_var_partial += d_x[i] * (x[i] - block_mean); + d_x_mean_partial += static_cast(d_x[i]); + d_x_var_partial += + static_cast(d_x[i]) * (static_cast(x[i]) - block_mean); } auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), - PairForLayerNormAddFunctor()); + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); if (threadIdx.x == 0) { - d_x_reduce_tmp[0] = pair.first_ / feature_size; - d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + d_x_reduce_tmp[0] = static_cast(pair.first_) / feature_size; + d_x_reduce_tmp[1] = + static_cast(pair.second_) / + (feature_size * (static_cast(block_var) + epsilon)); } __syncthreads(); d_x_mean_partial = d_x_reduce_tmp[0]; d_x_var_partial = d_x_reduce_tmp[1]; for (int i = beg_idx; i < end_idx; i += BlockDim) { - d_x[i] -= d_x_mean_partial; - d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + d_x[i] -= static_cast(d_x_mean_partial); + d_x[i] -= + static_cast((static_cast(x[i]) - block_mean) * d_x_var_partial); } } // Here, we only calculate d_x -template +template __global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, - T *d_x, const T *mean, - const T *var, const T *scale, + T *d_x, const U *mean, + const U *var, const U 
*scale, float epsilon, int feature_size) { - using BlockReduce = cub::BlockReduce, BlockDim>; + using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T d_x_reduce_tmp[2]; + __shared__ U d_x_reduce_tmp[2]; int beg_idx = blockIdx.x * feature_size + threadIdx.x; int end_idx = (blockIdx.x + 1) * feature_size; - T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; - T d_x_mean_partial = 0, d_x_var_partial = 0; + U block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; + U d_x_mean_partial = static_cast(0), d_x_var_partial = static_cast(0); for (int i = beg_idx; i < end_idx; i += BlockDim) { - auto var_val = static_cast(real_sqrt(block_var + epsilon)); + auto var_val = + static_cast(real_sqrt(static_cast(block_var) + epsilon)); if (scale != nullptr) { int col_idx = i % feature_size; - d_x[i] = d_y[i] * scale[col_idx] / var_val; + d_x[i] = + static_cast(static_cast(d_y[i]) * scale[col_idx] / var_val); } else { - d_x[i] = d_y[i] / var_val; + d_x[i] = static_cast(static_cast(d_y[i]) / var_val); } - d_x_mean_partial += d_x[i]; - d_x_var_partial += d_x[i] * (x[i] - block_mean); + d_x_mean_partial += static_cast(d_x[i]); + d_x_var_partial += + static_cast(d_x[i]) * (static_cast(x[i]) - block_mean); } auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), - PairForLayerNormAddFunctor()); + .Reduce(PairForLayerNorm(d_x_mean_partial, d_x_var_partial), + PairForLayerNormAddFunctor()); if (threadIdx.x == 0) { - d_x_reduce_tmp[0] = pair.first_ / feature_size; - d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon)); + d_x_reduce_tmp[0] = static_cast(pair.first_) / feature_size; + d_x_reduce_tmp[1] = + static_cast(pair.second_) / + (feature_size * (static_cast(block_var) + epsilon)); } __syncthreads(); d_x_mean_partial = d_x_reduce_tmp[0]; d_x_var_partial = d_x_reduce_tmp[1]; for (int i = beg_idx; i < end_idx; i += BlockDim) { - d_x[i] -= d_x_mean_partial; - d_x[i] -= (x[i] - block_mean) * d_x_var_partial; + d_x[i] -= static_cast(d_x_mean_partial); + d_x[i] -= + static_cast((static_cast(x[i]) - block_mean) * d_x_var_partial); } } -template +template __global__ void LayerNormBackwardWhenBatchSizeIsOne( - const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean, - const T *var, const T *scale, float epsilon, int feature_size) { + const T *x, const T *d_y, T *d_x, U *d_scale, U *d_bias, const U *mean, + const U *var, const U *scale, float epsilon, int feature_size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < feature_size) { - auto var_val = static_cast(real_sqrt(var[idx] + epsilon)); + auto var_val = + static_cast(real_sqrt(static_cast(var[idx]) + epsilon)); if (d_x != nullptr) { if (d_scale == nullptr) { - d_x[idx] = d_y[idx] / var_val; + d_x[idx] = static_cast(static_cast(d_y[idx]) / var_val); } else { - d_x[idx] = d_y[idx] * scale[idx] / var_val; + d_x[idx] = + static_cast(static_cast(d_y[idx]) * scale[idx] / var_val); } } if (d_scale != nullptr) { - d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val; + d_scale[idx] = static_cast(d_y[idx]) * + (static_cast(x[idx]) - mean[idx]) / var_val; } - if (d_bias != nullptr) d_bias[idx] = d_y[idx]; + if (d_bias != nullptr) d_bias[idx] = static_cast(d_y[idx]); } } -template -static void LayerNormBackward(const T *x, const T *d_y, const T *scale, - const T *mean, const T *var, T *d_x, T *d_scale, - T *d_bias, float epsilon, int batch_size, +template +static void LayerNormBackward(const T *x, 
const T *d_y, const U *scale, + const U *mean, const U *var, T *d_x, U *d_scale, + U *d_bias, float epsilon, int batch_size, int feature_size, cudaStream_t stream) { const int kMaxBlockDim = 512; const int kMaxBlockNum = 128; @@ -362,14 +394,14 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, if (batch_size == 1) { LayerNormBackwardWhenBatchSizeIsOne< - T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, - stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, - feature_size); + T, U><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, + 0, stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, + epsilon, feature_size); if (d_x != nullptr) { switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< - T, kBlockDim><<<1, kBlockDim, 0, stream>>>( + T, U, kBlockDim><<<1, kBlockDim, 0, stream>>>( x, d_x, mean, var, epsilon, feature_size)); } } @@ -383,7 +415,7 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientScaleOrBias< - T, kBlockDim, false, + T, U, kBlockDim, false, false><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); @@ -394,7 +426,8 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientScaleOrBias< - T, kBlockDim, false, true><<>>( + T, U, kBlockDim, false, + true><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -404,7 +437,7 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientAll< - T, kBlockDim, false><<>>( + T, U, kBlockDim, false><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -413,7 +446,7 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( LayerNormBackwardGradientOnlyDX< - T, kBlockDim><<>>( + T, U, kBlockDim><<>>( x, d_y, d_x, mean, var, scale, epsilon, feature_size)); } break; @@ -422,14 +455,15 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientScaleOrBias< - T, kBlockDim, true, false><<>>( + T, U, kBlockDim, true, + false><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( LayerNormBackwardPostProcessToCalculateDX< - T, kBlockDim><<>>( + T, U, kBlockDim><<>>( x, d_x, mean, var, epsilon, feature_size)); } break; @@ -438,14 +472,15 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientScaleOrBias< - T, kBlockDim, true, true><<>>( + T, U, kBlockDim, true, + true><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( LayerNormBackwardPostProcessToCalculateDX< - T, kBlockDim><<>>( + T, U, kBlockDim><<>>( x, d_x, mean, var, epsilon, feature_size)); } break; @@ -454,14 +489,14 @@ static void LayerNormBackward(const 
T *x, const T *d_y, const T *scale, FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, LayerNormBackwardGradientAll< - T, kBlockDim, true><<>>( + T, U, kBlockDim, true><<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( LayerNormBackwardPostProcessToCalculateDX< - T, kBlockDim><<>>( + T, U, kBlockDim><<>>( x, d_x, mean, var, epsilon, feature_size)); } break; @@ -483,7 +518,7 @@ void LayerNormDirectCUDAFunctor::operator()(cudaStream_t stream, int feature_size = static_cast(matrix_dim[1]); switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( + LayerNormForward<<>>( input, scale, bias, output, mean, variance, eps, feature_size)); default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -511,10 +546,12 @@ class LayerNormKernel const auto x_dims = x->dims(); auto *x_data = x->data(); auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - auto *scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *bias_data = (bias == nullptr ? nullptr : bias->data()); + auto *mean_data = mean->mutable_data>(ctx.GetPlace()); + auto *var_data = var->mutable_data>(ctx.GetPlace()); + auto *scale_data = + (scale == nullptr ? nullptr : scale->data>()); + auto *bias_data = + (bias == nullptr ? nullptr : bias->data>()); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); int batch_size = static_cast(matrix_dim[0]); @@ -524,7 +561,8 @@ class LayerNormKernel switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( + LayerNormForward, + kBlockDim><<>>( x_data, scale_data, bias_data, y_data, mean_data, var_data, epsilon, feature_size)); default: @@ -540,6 +578,7 @@ class LayerNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; const float epsilon = ctx.Attr("epsilon"); // d_x, d_scale, d_bias may be nullptr auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -554,14 +593,15 @@ class LayerNormGradKernel auto *x_data = x->data(); auto *d_y_data = d_y->data(); - auto *mean_data = mean->data(); - auto *var_data = var->data(); - auto *scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *mean_data = mean->data(); + auto *var_data = var->data(); + + auto *scale_data = (scale == nullptr ? nullptr : scale->data()); auto *d_scale_data = (d_scale == nullptr ? nullptr - : d_scale->mutable_data(ctx.GetPlace())); + : d_scale->mutable_data(ctx.GetPlace())); auto *d_bias_data = - (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); auto *d_x_data = (d_x == nullptr ? 
nullptr : d_x->mutable_data(ctx.GetPlace())); @@ -573,12 +613,14 @@ class LayerNormGradKernel auto stream = ctx.cuda_device_context().stream(); - LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, - d_x_data, d_scale_data, d_bias_data, epsilon, - batch_size, feature_size, stream); + LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, + d_x_data, d_scale_data, d_bias_data, epsilon, + batch_size, feature_size, stream); } }; + template class LayerNormDirectCUDAFunctor; + #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE #undef FIXED_BLOCK_DIM_CASE_BASE @@ -587,11 +629,15 @@ template class LayerNormDirectCUDAFunctor; } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); + ops::LayerNormKernel, + ops::LayerNormKernel); REGISTER_OP_CUDA_KERNEL( layer_norm_grad, ops::LayerNormGradKernel, - ops::LayerNormGradKernel); + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 8c467a4969e..a92d8f17db1 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -109,9 +109,11 @@ gray_list = { 'elementwise_mod', 'elementwise_floordiv', 'batch_norm', + 'layer_norm', 'tanh', 'sigmoid', 'lookup_table', + 'lookup_table_v2', 'top_k', 'pool2d', 'pool3d', @@ -123,6 +125,7 @@ gray_list = { 'flatten2', 'stack', 'unstack', + 'uniform_random', 'uniform_random_batch_size_like', 'gaussian_random', 'gaussian_random_batch_size_like', @@ -192,7 +195,6 @@ unsupported_fp16_list = { 'sequence_concat', 'sequence_slice', 'data_norm', - 'layer_norm', 'group_norm', 'spectral_norm', 'depthwise_conv2d_transpose', diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 1d9f8af1020..99a1be82ab7 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -70,7 +70,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): for in_name in op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ - 'batch_norm', 'fused_bn_add_activation' + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' ]: if in_name not in {'X', 'Z'}: continue @@ -104,8 +104,9 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): op._set_attr('in_dtype', dest_dtype) if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: - if op.type in ['batch_norm', 'fused_bn_add_activation' - ] and out_name != 'Y': + if op.type in [ + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' + ] and out_name != 'Y': continue for out_var_name in op.output(out_name): out_var = block.var(out_var_name) diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index d2c07c185dd..d17942fe3be 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle from operator import mul import paddle.fluid.core as core @@ -210,7 +211,7 @@ class TestLayerNormOp(unittest.TestCase): for name 
in ['x', 'scale', 'bias', 'y@GRAD'] }, fetch_list=fetch_list) - self.__assert_close(y, out[0], "y") + self.__assert_close(y, out[0], "y", 1e-3) self.__assert_close(mean, out[1], "mean") self.__assert_close(variance, out[2], "variance", 1e-3) self.__assert_close(x_grad, out[3], "x_grad") @@ -310,6 +311,8 @@ class TestLayerNormAPI(unittest.TestCase): class TestDygraphLayerNormAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): + paddle.enable_static() + layer_norm = fluid.LayerNorm([32, 32]) # the input of LayerNorm must be Variable. x1 = np.random.random((3, 32, 32)).astype('float32') diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index efde54182e5..32c7a030315 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -293,7 +293,8 @@ def layer_norm(x, 'begin_norm_axis', begin_norm_axis) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) - check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm') + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'LayerNorm') inputs = dict() inputs['X'] = [x] @@ -305,11 +306,13 @@ def layer_norm(x, # create output helper = LayerHelper('layer_norm', **locals()) + + dtype = x.dtype mean_out = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True) variance_out = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) - layer_norm_out = helper.create_variable_for_type_inference(x.dtype) + dtype=dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="layer_norm", -- GitLab
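
The essence of the CUDA changes above is a storage/compute split: X and Y keep the kernel dtype T (float16 here), while accumulation and the mean, variance, scale, and bias tensors use U = LayerNormParamType<T>, which is float for float16/float32 input. The following is a minimal self-contained sketch of that pattern, not the Paddle kernel itself: the kernel name LayerNormFwdSketch, the one-block-per-row launch with a power-of-two block size, and the conventional 1/sqrt(var + epsilon) normalization (the patched forward kernel instead adds epsilon after the square root) are all simplifying assumptions made for illustration.

// Sketch only: fp16 storage with fp32 accumulation, one block per row.
// Assumes blockDim.x == kBlockDim and kBlockDim is a power of two.
#include <cuda_fp16.h>

template <int kBlockDim>
__global__ void LayerNormFwdSketch(const __half *x, const float *scale,
                                   const float *bias, __half *y, float *mean,
                                   float *var, float epsilon,
                                   int feature_size) {
  __shared__ float s_sum[kBlockDim];
  __shared__ float s_sq[kBlockDim];
  const __half *row_x = x + blockIdx.x * feature_size;
  __half *row_y = y + blockIdx.x * feature_size;

  float sum = 0.f, sq = 0.f;
  for (int i = threadIdx.x; i < feature_size; i += kBlockDim) {
    float xv = __half2float(row_x[i]);  // promote to fp32 before accumulating
    sum += xv;
    sq += xv * xv;
  }
  s_sum[threadIdx.x] = sum;
  s_sq[threadIdx.x] = sq;
  __syncthreads();

  // Tree reduction over the block (kBlockDim is a power of two).
  for (int stride = kBlockDim / 2; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride) {
      s_sum[threadIdx.x] += s_sum[threadIdx.x + stride];
      s_sq[threadIdx.x] += s_sq[threadIdx.x + stride];
    }
    __syncthreads();
  }

  float m = s_sum[0] / feature_size;
  float v = s_sq[0] / feature_size - m * m;
  if (threadIdx.x == 0) {
    mean[blockIdx.x] = m;  // statistics are written in fp32
    var[blockIdx.x] = v;
  }
  float inv_std = rsqrtf(v + epsilon);

  for (int i = threadIdx.x; i < feature_size; i += kBlockDim) {
    float norm = (__half2float(row_x[i]) - m) * inv_std;
    if (scale != nullptr) norm *= scale[i];
    if (bias != nullptr) norm += bias[i];
    row_y[i] = __float2half(norm);  // cast back to fp16 only at the end
  }
}

A launch of the sketch would look like LayerNormFwdSketch<256><<<batch_size, 256, 0, stream>>>(x, scale, bias, y, mean, var, 1e-5f, feature_size); the real kernels instead pick the block size at runtime through GetDesiredBlockDim and the FIXED_BLOCK_DIM_CASE macros seen in the diff.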
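
The backward kernels use the same split: d_x stays in T while d_scale and d_bias are accumulated and written in U (float). Below is a sketch of just the parameter-gradient reduction under the same illustrative assumptions (hypothetical kernel name, one thread per feature column, row-major x and d_y of shape [batch_size, feature_size]):

// Sketch only: parameter gradients accumulated in fp32 while x, d_y are fp16.
// d_scale[j] = sum_r d_y[r][j] * (x[r][j] - mean[r]) / sqrt(var[r] + eps)
// d_bias[j]  = sum_r d_y[r][j]
#include <cuda_fp16.h>

__global__ void LayerNormGradParamsSketch(const __half *x, const __half *d_y,
                                          const float *mean, const float *var,
                                          float *d_scale, float *d_bias,
                                          float epsilon, int batch_size,
                                          int feature_size) {
  int j = blockIdx.x * blockDim.x + threadIdx.x;  // feature column index
  if (j >= feature_size) return;
  float ds = 0.f, db = 0.f;
  for (int r = 0; r < batch_size; ++r) {
    float g = __half2float(d_y[r * feature_size + j]);
    float xv = __half2float(x[r * feature_size + j]);
    float inv_std = rsqrtf(var[r] + epsilon);
    ds += g * (xv - mean[r]) * inv_std;
    db += g;
  }
  d_scale[j] = ds;
  d_bias[j] = db;
}

Keeping the parameters, statistics, and their gradients in float is also why the AMP pass in fp16_utils.py only casts the X input of layer_norm and leaves Scale and Bias untouched, matching the float-type check that the new GetExpectedKernelType enforces in layer_norm_op.cc.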