Unverified commit 1bc47c84, authored by Yao Zihang, committed by GitHub

Optimize batchnorm1d using 2D kernel (#43530)

Parent a2c4c86b
@@ -591,10 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx,
     //     ctx.GetPlace()),
     //     epsilon, saved_mean_data, saved_var_data));
 #else
-    // CUDNN PER_ACTIVATION mode only support small batch size
+    // CUDNN only supports small batch sizes
     const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
+    const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
     const bool use_native_kernel =
-        (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
+        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
+         (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
     if (use_native_kernel) {
       if (compute_format == DataLayout::kNCHW) {
         BNBackward<T, block, DataLayout::kNCHW>
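The hunk above widens the condition for bypassing cuDNN: PER_ACTIVATION mode is used for 2-D [N, C] inputs only while N stays below 131070, and the commit adds the analogous SPATIAL-mode cap of 880801 for 3-D [N, C, L] inputs; past either threshold the hand-written CUDA kernel takes over. A minimal sketch of that dispatch rule in Python follows; the use_native_kernel helper is illustrative, not Paddle API:

# Sketch of the dispatch rule encoded by the C++ hunk above (illustrative).
CUDNN_PER_ACTIVATION_THRESHOLD = 131070  # cap for 2-D [N, C] inputs
CUDNN_SPATIAL_THRESHOLD = 880801         # cap for 3-D [N, C, L] inputs

def use_native_kernel(x_dims):
    # True when the hand-written CUDA kernel should replace the cuDNN path.
    n = x_dims[0]  # batch size; corresponds to N in the kernel
    return ((len(x_dims) == 2 and n >= CUDNN_PER_ACTIVATION_THRESHOLD) or
            (len(x_dims) == 3 and n >= CUDNN_SPATIAL_THRESHOLD))

assert use_native_kernel([200000, 4])        # large [N, C] batch -> native
assert use_native_kernel([1000000, 4, 4])    # large [N, C, L] batch -> native
assert not use_native_kernel([4, 10, 4, 4])  # 4-D input -> cuDNN path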
@@ -82,50 +82,58 @@ class TestBatchNorm(unittest.TestCase):
         self.assertRaises(ValueError, error2d_dataformat)
         self.assertRaises(ValueError, error3d_dataformat)
 
-    def test_eager_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            shape = [4, 10, 4, 4]
+    def test_large_batch(self):
 
-            def compute_v1(x):
+        def compute_baseline(x):
             with fluid.dygraph.guard(p):
                 bn = fluid.dygraph.BatchNorm(shape[1])
                 #bn = paddle.nn.BatchNorm2D(shape[1])
                 x1 = paddle.to_tensor(x)
                 x1.stop_gradient = False
                 y = bn(x1)
                 y.backward()
                 return y.numpy(), x1.gradient()
 
-            def compute_v2(x):
+        def compute_1d(x):
             with fluid.dygraph.guard(p):
                 with _test_eager_guard():
-                    print("v2")
-                    bn = paddle.nn.BatchNorm2D(shape[1])
+                    bn = paddle.nn.BatchNorm1D(shape[1])
                     x1 = paddle.to_tensor(x)
                     x1.stop_gradient = False
                     y = bn(x1)
                     y.backward()
                     return y.numpy(), x1.gradient()
 
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            # [N, C]
+            shape = [200000, 4]
+            x = np.random.randn(*shape).astype("float32")
+            y1, g1 = compute_baseline(x)
+            y2, g2 = compute_1d(x)
+            self.assertTrue(np.allclose(g1, g2))
+            self.assertTrue(np.allclose(y1, y2))
+            # [N, C, L]
+            shape = [1000000, 4, 4]
             x = np.random.randn(*shape).astype("float32")
-            y1, g1 = compute_v1(x)
-            y2, g2 = compute_v2(x)
+            y1, g1 = compute_baseline(x)
+            y2, g2 = compute_1d(x)
             self.assertTrue(np.allclose(g1, g2))
             self.assertTrue(np.allclose(y1, y2))
-    def test_eager_api_1d(self):
+    def test_eager_api(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
-            shape = [200000, 4]
+            shape = [4, 10, 4, 4]
 
             def compute_v1(x):
                 with fluid.dygraph.guard(p):
                     bn = fluid.dygraph.BatchNorm(shape[1])
                     #bn = paddle.nn.BatchNorm2D(shape[1])
                     x1 = paddle.to_tensor(x)
                     x1.stop_gradient = False
                     y = bn(x1)
@@ -135,7 +143,8 @@ class TestBatchNorm(unittest.TestCase):
             def compute_v2(x):
                 with fluid.dygraph.guard(p):
                     with _test_eager_guard():
-                        bn = paddle.nn.BatchNorm1D(shape[1])
+                        print("v2")
+                        bn = paddle.nn.BatchNorm2D(shape[1])
                     x1 = paddle.to_tensor(x)
                     x1.stop_gradient = False
                     y = bn(x1)
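The renamed test_large_batch above pushes both shapes past the cuDNN thresholds and checks the native kernel against the legacy fluid layer. A condensed, standalone version of the same comparison, assuming a CUDA build of Paddle 2.x (eager mode by default); the run helper is illustrative:

import numpy as np
import paddle
import paddle.fluid as fluid

def run(make_layer, x_np):
    # Forward and backward through a freshly built layer; return output and dx.
    x = paddle.to_tensor(x_np)
    x.stop_gradient = False
    y = make_layer()(x)
    y.backward()
    return y.numpy(), x.gradient()

x_np = np.random.randn(200000, 4).astype("float32")  # [N, C] with N past 131070
y1, g1 = run(lambda: fluid.dygraph.BatchNorm(4), x_np)  # baseline layer
y2, g2 = run(lambda: paddle.nn.BatchNorm1D(4), x_np)    # exercises the new kernel
print(np.allclose(y1, y2), np.allclose(g1, g2))  # the test asserts both are True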