Unverified commit 99cfcc09, authored by Li Min, committed by GitHub

Optimize layer norm backward cuda kernel when cols is 1024. (#39247)

* Add fp16 support for scale/bias for fused_layernorm_residual_dropout_bias op.

* Remove useless code.

* Remove useless code.

* Optimize layer_norm fwd when cols is 1024.

* Remove useless code.

* Minors.

* Minors.

* Modifications according to reviews.

* Minors.

* Optimize layer_norm bwd kernel when cols is 1024.

* Polish layer_norm_bwd_1024 kernel.

* Limit ln_bwd_1024_kernel to paddle_with_cuda.

* Fix double type compile error.

* Add optimization of ln bwd for fused_dropout_add_ln op.

* Polish codes.
Parent 92da5055
@@ -284,11 +284,30 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
P* d_layernorm_bias, T* d_dropout_src,
T* d_bias, T* d_residual) {
using U = LayerNormParamType<T>;
LayerNormBackward<T, U, is_same_type>(
layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale,
d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx);
this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src,
d_residual, d_bias);
bool can_call_1024_kernel = false;
// Fast path for the case where cols is 1024 and linear_bias is nullptr.
// A non-null linear_bias would also be feasible for this implementation,
// but it is not supported here.
if (this->cols_ == 1024 && d_bias == nullptr && d_scale != nullptr &&
d_layernorm_bias != nullptr && sizeof(T) <= 4) {
can_call_1024_kernel = true;
}
VLOG(6) << "LaunchLayernormResidualDropoutGrad = " << can_call_1024_kernel;
if (can_call_1024_kernel) {
LaunchLayernormResidualDropoutGrad<T, U, MaskType, is_same_type>(
ctx, this->rows_, this->cols_, epsilon_,
this->dropout_param_.dropout_prob,
this->dropout_param_.is_upscale_in_train, d_out, layernorm_src, gamma,
mean, variance, mask, d_scale, d_layernorm_bias, d_residual,
d_dropout_src);
} else {
LayerNormBackward<T, U, is_same_type>(
layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale,
d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx);
this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src,
d_residual, d_bias);
}
}
protected:
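The hunk above adds a dispatch: the specialized 1024-column backward kernel is used only when the fast-path conditions hold, otherwise the original LayerNormBackward + ResidualDropoutBiasGrad path runs. Below is a minimal standalone sketch of that predicate; the helper name CanUseLnBwd1024FastPath is hypothetical and not part of the patch.

```cpp
#include <cstdint>

// Hypothetical helper (not in the patch) mirroring the fast-path check in
// Backward() above: the 1024-column kernel is chosen only when the row width
// is exactly 1024, no linear-bias gradient is requested, both layer_norm
// parameter gradients are requested, and T is at most 4 bytes (fp16/fp32),
// since the specialized kernel does not handle double.
template <typename T>
bool CanUseLnBwd1024FastPath(uint32_t cols, const void* d_bias,
                             const void* d_scale,
                             const void* d_layernorm_bias) {
  return cols == 1024 && d_bias == nullptr && d_scale != nullptr &&
         d_layernorm_bias != nullptr && sizeof(T) <= 4;
}
```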
@@ -441,5 +441,30 @@ void LaunchLayernormResidualDropoutBias(
}
}
template <typename T, typename U, typename MaskType,
bool ScaleBiasWithSameTypeX = false>
void LaunchLayernormResidualDropoutGrad(
const platform::CUDADeviceContext &dev_ctx, const uint32_t rows,
const uint32_t cols, const float epsilon, const float dropout_prob,
const bool is_upscale_in_train, const T *d_out, const T *layernorm_src,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
const LayerNormParamType<T> *mean, const LayerNormParamType<T> *var,
const MaskType *mask_data,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_scale,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_layernorm_bias,
T *d_residual, T *d_dropout_src) {
const T zero = static_cast<T>(0.0f);
auto factor = dropout_prob == static_cast<float>(1.0f)
? zero
: static_cast<T>(1.0f / (1.0f - dropout_prob));
if (!is_upscale_in_train) {
factor = static_cast<T>(1.0f);
}
ln_bwd_1024_kernel_driver<
T, U, LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, MaskType>(
dev_ctx, rows, cols, epsilon, layernorm_src, scale, mean, var, d_out,
d_residual, d_scale, d_layernorm_bias, mask_data, factor, d_dropout_src);
}
} // namespace operators
} // namespace paddle
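For reference, the dropout factor computed in LaunchLayernormResidualDropoutGrad scales the masked gradients: with upscale-in-train the factor is 1/(1 - dropout_prob) (or 0 when dropout_prob is 1), and 1 otherwise. The host-side sketch below illustrates that arithmetic on a few elements; DropoutGradFactor and the small main() are illustrative assumptions, not the CUDA kernel itself.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative host code (not the CUDA kernel): reproduces the factor logic
// above and applies it element-wise as
//   d_dropout_src[i] = d_layernorm_src[i] * mask[i] * factor
float DropoutGradFactor(float dropout_prob, bool is_upscale_in_train) {
  if (!is_upscale_in_train) return 1.0f;
  return dropout_prob == 1.0f ? 0.0f : 1.0f / (1.0f - dropout_prob);
}

int main() {
  const float d_layernorm_src[4] = {0.5f, -1.0f, 2.0f, 0.25f};
  const uint8_t mask[4] = {1, 0, 1, 1};
  const float factor = DropoutGradFactor(/*dropout_prob=*/0.1f,
                                         /*is_upscale_in_train=*/true);
  for (int i = 0; i < 4; ++i) {
    printf("d_dropout_src[%d] = %f\n", i,
           d_layernorm_src[i] * mask[i] * factor);
  }
  return 0;
}
```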