diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 4aedf4eb79bd10587f9e2d2b1305839b1677bb5f..301b62524a54dda8c4abba23983b58cc36090d76 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -573,7 +573,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -608,7 +608,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index e37f048235e7c6dbe192d2c958b88c4c75039dc2..8ed706a5443af39fe19f198f9083998b428ff10d 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -252,7 +252,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -289,7 +289,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) {