[cherry-pick] Fix bn performance degradation (#50382)

att, cherry-pick: #48563 , #50287

[cherry-pick] Fix bn performance degradation (#50382)
att, cherry-pick: #48563 , #50287
eb610740 · zhangkaihuo · GitHub · 59fec5d6 · eb610740
隐藏空白更改
内联并排

Showing with 22 addition and 5 deletion

paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +22 -5

未找到文件。
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -855,7 +855,8 @@ void BatchNormGradRawKernel(const Context &ctx,
    }
    // CUDNN only support small batch size
    bool use_native_nhwc =
-        d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC)
+        d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC &&
+               H * W >= CUDNN_SPATIAL_THRESHOLD_EVAL)
            : false;
    const bool use_native_kernel =
        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
@@ -933,6 +934,21 @@ void BatchNormGradRawKernel(const Context &ctx,
                    flag_ptr);
          }
          // 2. reduce_sum(x, dy, mean) => dscale, dbias
+          BatchNormParamType<T> *dscale = nullptr;
+          BatchNormParamType<T> *dbias = nullptr;
+          bool with_scale = false;
+          if (d_scale && d_bias) {
+            dscale = ctx.template Alloc<BatchNormParamType<T>>(d_scale);
+            dbias = ctx.template Alloc<BatchNormParamType<T>>(d_bias);
+          } else {
+            DenseTensor dscale_mem =
+                phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
+            DenseTensor dbias_mem =
+                phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
+            dscale = dscale_mem.data<BatchNormParamType<T>>();
+            dbias = dbias_mem.data<BatchNormParamType<T>>();
+          }
          BNBackward2DChannelLastStage2<T, block_size>
              <<<grid, block, 0, ctx.stream()>>>(
                  transformed_d_y.template data<T>(),
@@ -944,8 +960,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                  H * W * D,
                  epsilon,
                  block_data_ptr,
-                  ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+                  dscale,
-                  ctx.template Alloc<BatchNormParamType<T>>(d_bias),
+                  dbias,
                  flag_ptr);
          // 3. elementwise_mul(scale, mean, inv_var, dy, dscale, dbias) => dx
@@ -954,8 +970,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                  transformed_d_y.template data<T>(),
                  transformed_x.template data<T>(),
                  scale.template data<BatchNormParamType<T>>(),
-                  d_scale->data<BatchNormParamType<T>>(),
+                  dscale,
-                  d_bias->data<BatchNormParamType<T>>(),
+                  dbias,
                  mean_ptr,
                  variance_ptr,
                  C,
@@ -1165,6 +1181,7 @@ void BatchNormGradRawKernel(const Context &ctx,
        paddle::platform::dynload::cudnnDestroyTensorDescriptor(
            bn_param_desc_));
 #endif
  } else {
    const auto *running_mean = mean.get_ptr();
    const auto *running_var = variance.get_ptr();