diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4c3b8ec78190723598a56f7633764f10dd5047f3..b395739809dbd187e36c28b3f609a0a08c839643 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -273,9 +273,9 @@ op_library(squeeze_op DEPS reshape_op)
 op_library(extract_rows_op DEPS memory)
 op_library(flatten_op DEPS reshape_op)
-
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
+    op_library(layer_norm_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()
diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
index 6840e1e08f3d5bc84a05f15e30982c7cfb59680b..0886c41a1b582881faf24f5531d414db4e4db71c 100644
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cub/cub.cuh>
 #include "paddle/fluid/operators/layer_norm_op.h"
 
+namespace paddle {
+namespace operators {
+
+inline static int GetDesiredBlockDim(int block_dim) {
+  const int kMaxBlockDim = 512;
+  return block_dim >= kMaxBlockDim
+             ? kMaxBlockDim
+             : (1 << (static_cast<int>(std::log2f(block_dim))));
+}
+
+#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
+  case (1 << (log2_block_dim)): {                       \
+    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
+    __VA_ARGS__;                                        \
+  } break
+
+#define FIXED_BLOCK_DIM_CASE(...)              \
+  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
+
+static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
+static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
+
+template <typename T>
+struct PairForLayerNorm {
+  __device__ __forceinline__ PairForLayerNorm() {}
+  __device__ __forceinline__ PairForLayerNorm(const T &first, const T &second)
+      : first_(first), second_(second) {}
+
+  T first_;
+  T second_;
+};
+
+template <typename T>
+struct PairForLayerNormAddFunctor {
+  __device__ __forceinline__ PairForLayerNorm<T> operator()(
+      const PairForLayerNorm<T> &p1, const PairForLayerNorm<T> &p2) {
+    return PairForLayerNorm<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
+  }
+};
+
+template <typename T, int BlockDim>
+__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
+                                 T *y, T *mean, T *var, float epsilon,
+                                 int feature_size) {
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * feature_size;
+
+  // Step 1: Reduce to calculate mean and var
+  T mean_val = static_cast<T>(0);
+  T var_val = static_cast<T>(0);
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    T tmp = x[i];
+    mean_val += tmp;
+    var_val += (tmp * tmp);
+  }
+  auto pair = BlockReduce(temp_storage)
+                  .Reduce(PairForLayerNorm<T>(mean_val, var_val),
+                          PairForLayerNormAddFunctor<T>());
+  if (threadIdx.x == 0) {
+    auto tmp = pair.first_ / feature_size;
+    mean[blockIdx.x] = tmp;
+    var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
+  }
+  __syncthreads();
+  mean_val = mean[blockIdx.x];
+  var_val = static_cast<T>(real_sqrt(var[blockIdx.x] + epsilon));
+
+  // Step 2: Calculate y
+  if (scale != nullptr) {
+    if (bias != nullptr) {
+      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
+           i += BlockDim, j += BlockDim) {
+        y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j];
+      }
+    } else {
+      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
+           i += BlockDim, j += BlockDim) {
+        y[i] = scale[j] * (x[i] - mean_val) / var_val;
+      }
+    }
+  } else {  // scale == nullptr
+    if (bias != nullptr) {
+      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
+           i += BlockDim, j += BlockDim) {
+        y[i] = (x[i] - mean_val) / var_val + bias[j];
+      }
+    } else {
+      for (int i = beg_idx, j = threadIdx.x; i < end_idx;
+           i += BlockDim, j += BlockDim) {
+        y[i] = (x[i] - mean_val) / var_val;
+      }
+    }
+  }
+}
+
+// Make sure that d_scale != nullptr && d_bias != nullptr
+// Since d_scale != nullptr, scale would not be nullptr
+template <typename T, int BlockDim, bool HasDx>
+__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
+                                             T *d_scale, T *d_bias, T *d_x,
+                                             const T *mean, const T *var,
+                                             const T *scale, float epsilon,
+                                             int batch_size, int feature_size) {
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  int beg_idx = threadIdx.x * feature_size + blockIdx.x;
+  int end_idx = batch_size * feature_size + blockIdx.x;
+  int stride = BlockDim * feature_size;
+
+  T d_scale_partial = 0, d_bias_partial = 0;
+
+  for (int i = beg_idx; i < end_idx; i += stride) {
+    int row_idx = i / feature_size;
+    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
+    d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
+    d_bias_partial += d_y[i];
+    if (HasDx) {
+      d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
+    }
+  }
+
+  auto pair = BlockReduce(temp_storage)
+                  .Reduce(PairForLayerNorm<T>(d_scale_partial, d_bias_partial),
+                          PairForLayerNormAddFunctor<T>());
+
+  if (threadIdx.x == 0) {
+    d_scale[blockIdx.x] = pair.first_;
+    d_bias[blockIdx.x] = pair.second_;
+  }
+}
+
+// Make sure that there is only one true expression: d_scale != nullptr
+// or d_bias != nullptr
+// Notice: scale may be nullptr
+template <typename T, int BlockDim, bool HasDx, bool HasDScale>
+__global__ void LayerNormBackwardGradientScaleOrBias(
+    const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean,
+    const T *var, const T *scale, float epsilon, int batch_size,
+    int feature_size) {
+  using BlockReduce = cub::BlockReduce<T, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int beg_idx = threadIdx.x * feature_size + blockIdx.x;
+  int end_idx = batch_size * feature_size + blockIdx.x;
+  int stride = BlockDim * feature_size;
+  T d_scale_or_d_bias_partial = 0;
+
+  for (int i = beg_idx; i < end_idx; i += stride) {
+    int row_idx = i / feature_size;
+    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
+    if (HasDScale) {
+      d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
+    } else {  // d_bias != nullptr
+      d_scale_or_d_bias_partial += d_y[i];
+    }
+
+    if (HasDx) {
+      if (scale != nullptr) {
+        d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
+      } else {
+        d_x[i] = d_y[i] / var_val;
+      }
+    }
+  }
+
+  d_scale_or_d_bias_partial =
+      BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    if (HasDScale) {
+      d_scale[blockIdx.x] = d_scale_or_d_bias_partial;
+    } else {
+      d_bias[blockIdx.x] = d_scale_or_d_bias_partial;
+    }
+  }
+}
+
+template <typename T, int BlockDim>
+__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x,
+                                                          const T *mean,
+                                                          const T *var,
+                                                          float epsilon,
+                                                          int feature_size) {
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T d_x_reduce_tmp[2];
+
+  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * feature_size;
+
+  T block_mean = mean[blockIdx.x];
+  T block_var = var[blockIdx.x];
+  T d_x_mean_partial = 0, d_x_var_partial = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    d_x_mean_partial += d_x[i];
+    d_x_var_partial += d_x[i] * (x[i] - block_mean);
+  }
+
+  auto pair =
+      BlockReduce(temp_storage)
+          .Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
+                  PairForLayerNormAddFunctor<T>());
+
+  if (threadIdx.x == 0) {
+    d_x_reduce_tmp[0] = pair.first_ / feature_size;
+    d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
+  }
+  __syncthreads();
+
+  d_x_mean_partial = d_x_reduce_tmp[0];
+  d_x_var_partial = d_x_reduce_tmp[1];
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    d_x[i] -= d_x_mean_partial;
+    d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
+  }
+}
+
+// Here, we only calculate d_x
+template <typename T, int BlockDim>
+__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y,
+                                                T *d_x, const T *mean,
+                                                const T *var, const T *scale,
+                                                float epsilon,
+                                                int feature_size) {
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T d_x_reduce_tmp[2];
+
+  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * feature_size;
+
+  T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x];
+  T d_x_mean_partial = 0, d_x_var_partial = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    auto var_val = static_cast<T>(real_sqrt(block_var + epsilon));
+    if (scale != nullptr) {
+      int col_idx = i % feature_size;
+      d_x[i] = d_y[i] * scale[col_idx] / var_val;
+    } else {
+      d_x[i] = d_y[i] / var_val;
+    }
+    d_x_mean_partial += d_x[i];
+    d_x_var_partial += d_x[i] * (x[i] - block_mean);
+  }
+
+  auto pair =
+      BlockReduce(temp_storage)
+          .Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
+                  PairForLayerNormAddFunctor<T>());
+
+  if (threadIdx.x == 0) {
+    d_x_reduce_tmp[0] = pair.first_ / feature_size;
+    d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
+  }
+  __syncthreads();
+
+  d_x_mean_partial = d_x_reduce_tmp[0];
+  d_x_var_partial = d_x_reduce_tmp[1];
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    d_x[i] -= d_x_mean_partial;
+    d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
+  }
+}
+
+template <typename T>
+__global__ void LayerNormBackwardWhenBatchSizeIsOne(
+    const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean,
+    const T *var, const T *scale, float epsilon, int feature_size) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < feature_size) {
+    auto var_val = static_cast<T>(real_sqrt(var[idx] + epsilon));
+    if (d_x != nullptr) {
+      if (d_scale == nullptr) {
+        d_x[idx] = d_y[idx] / var_val;
+      } else {
+        d_x[idx] = d_y[idx] * scale[idx] / var_val;
+      }
+    }
+
+    if (d_scale != nullptr) {
+      d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val;
+    }
+
+    if (d_bias != nullptr) d_bias[idx] = d_y[idx];
+  }
+}
+
+template <typename T>
+static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
+                              const T *mean, const T *var, T *d_x, T *d_scale,
+                              T *d_bias, float epsilon, int batch_size,
+                              int feature_size, cudaStream_t stream) {
+  const int kMaxBlockDim = 512;
+  int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
+                      ((d_scale != nullptr ? 1 : 0) << 1) |
+                      ((d_bias != nullptr ? 1 : 0));
+  if (gradient_flag == 0) return;
+
+  if (batch_size == 1) {
+    LayerNormBackwardWhenBatchSizeIsOne<
+        T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
+             stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
+                       feature_size);
+
+    if (d_x != nullptr) {
+      switch (GetDesiredBlockDim(feature_size)) {
+        FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX<
+                             T, kBlockDim><<<1, kBlockDim, 0, stream>>>(
+            x, d_x, mean, var, epsilon, feature_size));
+      }
+    }
+    return;
+  }
+
+  auto block_dim = GetDesiredBlockDim(batch_size);
+  switch (gradient_flag) {
+    case 1:  // d_x == nullptr, d_scale == nullptr, d_bias != nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
+                             T, kBlockDim, false,
+                             false><<<feature_size, kBlockDim, 0, stream>>>(
+            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
+            feature_size));
+      }
+      break;
+    case 2:  // d_x == nullptr, d_scale != nullptr, d_bias == nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
+                             T, kBlockDim, false,
+                             true><<<feature_size, kBlockDim, 0, stream>>>(
+            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
+            feature_size));
+      }
+      break;
+    case 3:  // d_x == nullptr, d_scale != nullptr, d_bias != nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardGradientAll<
+                T, kBlockDim, false><<<feature_size, kBlockDim, 0, stream>>>(
+                x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
+                batch_size, feature_size));
+      }
+      break;
+    case 4:  // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
+      switch (GetDesiredBlockDim(feature_size)) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardGradientOnlyDX<
+                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+                x, d_y, d_x, mean, var, scale, epsilon, feature_size));
+      }
+      break;
+    case 5:  // d_x != nullptr, d_scale == nullptr, d_bias != nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
+                             T, kBlockDim, true,
+                             false><<<feature_size, kBlockDim, 0, stream>>>(
+            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
+            feature_size));
+      }
+      switch (GetDesiredBlockDim(feature_size)) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardPostProcessToCalculateDX<
+                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+                x, d_x, mean, var, epsilon, feature_size));
+      }
+      break;
+    case 6:  // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
+                             T, kBlockDim, true,
+                             true><<<feature_size, kBlockDim, 0, stream>>>(
+            x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
+            feature_size));
+      }
+      switch (GetDesiredBlockDim(feature_size)) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardPostProcessToCalculateDX<
+                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+                x, d_x, mean, var, epsilon, feature_size));
+      }
+      break;
+    case 7:  // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
+      switch (block_dim) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardGradientAll<
+                T, kBlockDim, true><<<feature_size, kBlockDim, 0, stream>>>(
+                x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
+                batch_size, feature_size));
+      }
+      switch (GetDesiredBlockDim(feature_size)) {
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardPostProcessToCalculateDX<
+                T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+                x, d_x, mean, var, epsilon, feature_size));
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+template <typename T>
+class LayerNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto *x = ctx.Input<Tensor>("X");
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    const auto x_dims = x->dims();
+    auto *x_data = x->data<T>();
+    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
+    auto *mean_data = mean->mutable_data<T>(ctx.GetPlace());
+    auto *var_data = var->mutable_data<T>(ctx.GetPlace());
+    auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
+    auto *bias_data = (bias == nullptr ? nullptr : bias->data<T>());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int batch_size = static_cast<int>(matrix_dim[0]);
+    int feature_size = static_cast<int>(matrix_dim[1]);
+
+    auto stream = ctx.cuda_device_context().stream();
+
+    switch (GetDesiredBlockDim(feature_size)) {
+      FIXED_BLOCK_DIM_CASE(
+          LayerNormForward<T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+              x_data, scale_data, bias_data, y_data, mean_data, var_data,
+              epsilon, feature_size));
+      default:
+        PADDLE_THROW(
+            "Product from begin_norm_axis to end must be larger than 1");
+        break;
+    }
+  }
+};
+
+template <typename T>
+class LayerNormGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    // d_x, d_scale, d_bias may be nullptr
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *mean = ctx.Input<Tensor>("Mean");
+    auto *var = ctx.Input<Tensor>("Variance");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+
+    auto *x_data = x->data<T>();
+    auto *d_y_data = d_y->data<T>();
+    auto *mean_data = mean->data<T>();
+    auto *var_data = var->data<T>();
+    auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
+    auto *d_scale_data =
+        (d_scale == nullptr ? nullptr
+                            : d_scale->mutable_data<T>(ctx.GetPlace()));
+    auto *d_bias_data =
+        (d_bias == nullptr ? nullptr : d_bias->mutable_data<T>(ctx.GetPlace()));
+    auto *d_x_data =
+        (d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
+
+    const auto &x_dims = x->dims();
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int batch_size = static_cast<int>(matrix_dim[0]);
+    int feature_size = static_cast<int>(matrix_dim[1]);
+
+    auto stream = ctx.cuda_device_context().stream();
+
+    LayerNormBackward<T>(x_data, d_y_data, scale_data, mean_data, var_data,
+                         d_x_data, d_scale_data, d_bias_data, epsilon,
+                         batch_size, feature_size, stream);
+  }
+};
+
+#undef FIXED_BLOCK_DIM_CASE_BASE
+#undef FIXED_BLOCK_DIM_CASE
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     layer_norm,