diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index aa613dd3f5ce00a76c4d2b95e9b1361f112e7b49..866de8e04a9bc88cefc78e965f7711c34bfa5c10 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -298,10 +298,16 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( for (int it = 0, col = c; it < LDGS; it++) { phi::Store( x[it], residual_out_ptr + row * ELTS_PER_ROW + col * VecSize); - phi::Store( - mask_vec[it], mask_out_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } + if (!is_test) { +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + phi::Store( + mask_vec[it], mask_out_ptr + row * ELTS_PER_ROW + col * VecSize); + col += THREADS_PER_ROW; + } + } U mu_local = 0.f; #pragma unroll