Fix nan in fast_ln_fwd_kernel when cols > 1024 (#44125)

* Fix nan in fast_ln_fwd_kernel when cols > 1024
* delete blas

Fix nan in fast_ln_fwd_kernel when cols > 1024 (#44125)
* Fix nan in fast_ln_fwd_kernel when cols > 1024
* delete blas
33540e10 · Zhang Zheng · GitHub · 9428c969 · 33540e10 · 33540e10
2 changed files
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -573,7 +573,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
        smem[warp_m * WARPS_N + warp_n] = mu_local;
      }
      __syncthreads();
-      if (tidx == 0) {
+      if (tidx % THREADS_PER_ROW == 0) {
        mu_local = 0.f;
 #pragma unroll
        for (int it = 0; it < WARPS_N; ++it) {
@@ -608,7 +608,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
        smem[warp_m * WARPS_N + warp_n] = var_local;
      }
      __syncthreads();
-      if (tidx == 0) {
+      if (tidx % THREADS_PER_ROW == 0) {
        var_local = 0.f;
 #pragma unroll
        for (int it = 0; it < WARPS_N; ++it) {

--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -252,7 +252,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
        smem[warp_m * WARPS_N + warp_n] = mu_local;
      }
      __syncthreads();
-      if (tidx == 0) {
+      if (tidx % THREADS_PER_ROW == 0) {
        mu_local = 0.f;
 #pragma unroll
        for (int it = 0; it < WARPS_N; ++it) {
@@ -289,7 +289,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
        smem[warp_m * WARPS_N + warp_n] = var_local;
      }
      __syncthreads();
-      if (tidx == 0) {
+      if (tidx % THREADS_PER_ROW == 0) {
        var_local = 0.f;
 #pragma unroll
        for (int it = 0; it < WARPS_N; ++it) {