[Cherry pick] fix fold for big bs (#49491)

* fix fold for large bs * fix fold for large bs * fix pre-commit

[Cherry pick] fix fold for big bs (#49491)
* fix fold for large bs * fix fold for large bs * fix pre-commit
2a438b0a · xiaoting · GitHub · d7855fe8 · 2a438b0a · 2a438b0a
3 changed file
--- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h
@@ -54,11 +54,8 @@ void FoldGradKernel(const Context& ctx,
  DDim out_shape =
      make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
-  DDim input_matrix_shape = make_ddim({x_dims[0],
+  DDim input_matrix_shape = make_ddim(
-                                       kernel_sizes[0],
+      {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
-                                       kernel_sizes[1],
-                                       output_height,
-                                       output_width});
  paddle::operators::math::
      Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>

--- a/paddle/phi/kernels/impl/fold_kernel_impl.h
+++ b/paddle/phi/kernels/impl/fold_kernel_impl.h
@@ -56,11 +56,8 @@ void FoldKernel(const Context& ctx,
  DDim output_shape =
      make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
-  DDim input_matrix_shape = make_ddim({x_dims[0],
+  DDim input_matrix_shape = make_ddim(
-                                       kernel_sizes[0],
+      {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
-                                       kernel_sizes[1],
-                                       output_height,
-                                       output_width});
  phi::funcs::SetConstant<Context, T> set_zero;
  set_zero(ctx, out, static_cast<T>(0));
@@ -68,6 +65,7 @@ void FoldKernel(const Context& ctx,
  for (int i = 0; i < batch_size; i++) {
    DenseTensor out_batch =
        out->Slice(i, i + 1).Resize(output_shape);  // im size=3
    DenseTensor in_batch =
        x.Slice(i, i + 1).Resize(input_matrix_shape);  // col size=5
    col2im(ctx, in_batch, dilations, strides, paddings, &out_batch);

--- a/python/paddle/fluid/tests/unittests/test_fold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fold_op.py
@@ -14,6 +14,7 @@
 from __future__ import print_function
 import math
 import numpy as np
 import unittest
@@ -45,34 +46,64 @@ class TestFoldOp(OpTest):
    def calc_fold(self):
        output_shape = [0] * 4
        output_shape[0] = self.batch_size
-        output_shape[1] = int(self.input_channels /
+        output_shape[1] = int(
-                              (self.kernel_sizes[0] * self.kernel_sizes[1]))
+            self.input_channels / (self.kernel_sizes[0] * self.kernel_sizes[1])
+        )
        output_shape[2] = self.output_sizes[0]
        output_shape[3] = self.output_sizes[1]
        dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
        dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
-        col_height = int((self.output_sizes[0] + self.paddings[0] +
+        col_height = (
-                          self.paddings[2] - dkernel_h) / self.strides[0]) + 1
+            int(
-        col_width = int((self.output_sizes[1] + self.paddings[1] +
+                (
-                         self.paddings[3] - dkernel_w) / self.strides[1]) + 1
+                    self.output_sizes[0]
+                    + self.paddings[0]
+                    + self.paddings[2]
+                    - dkernel_h
+                )
+                / self.strides[0]
+            )
+            + 1
+        )
+        col_width = (
+            int(
+                (
+                    self.output_sizes[1]
+                    + self.paddings[1]
+                    + self.paddings[3]
+                    - dkernel_w
+                )
+                / self.strides[1]
+            )
+            + 1
+        )
        output = np.zeros(output_shape).astype(np.float64)
        ############ calculate output ##############
        for b in range(output_shape[0]):
            for c in range(self.input_channels):
                w_offset = int(c % self.kernel_sizes[1])
                h_offset = int(
-                    (c / self.kernel_sizes[1]) % self.kernel_sizes[0])
+                    (c / self.kernel_sizes[1]) % self.kernel_sizes[0]
+                )
                c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1])
                for h in range(col_height):
-                    h_out = int(h * self.strides[0] - self.paddings[0] +
+                    h_out = int(
-                                h_offset * self.dilations[0])
+                        h * self.strides[0]
+                        - self.paddings[0]
+                        + h_offset * self.dilations[0]
+                    )
                    for w in range(col_width):
-                        w_out = int(w * self.strides[1] - self.paddings[1] +
+                        w_out = int(
-                                    w_offset * self.dilations[1])
+                            w * self.strides[1]
+                            - self.paddings[1]
+                            + w_offset * self.dilations[1]
+                        )
                        if (h_out >= 0 and h_out < self.output_sizes[0]) and (
-                                w_out >= 0 and w_out < self.output_sizes[1]):
+                            w_out >= 0 and w_out < self.output_sizes[1]
-                            output[b, c_out, h_out,
+                        ):
-                                   w_out] += self.x[b, c, w + col_width * h]
+                            output[b, c_out, h_out, w_out] += self.x[
+                                b, c, w + col_width * h
+                            ]
        self.outputs = output
@@ -85,7 +116,7 @@ class TestFoldOp(OpTest):
            'paddings': self.paddings,
            'dilations': self.dilations,
            'strides': self.strides,
-            'output_sizes': self.output_sizes
+            'output_sizes': self.output_sizes,
        }
        self.outputs = {'Y': self.outputs}
@@ -101,9 +132,23 @@ class TestFoldOp(OpTest):
        self.check_grad(['X'], 'Y', check_eager=True)
+class TestFoldshape(TestFoldOp):
+    def init_data(self):
+        self.batch_size = 8
+        self.input_channels = 3 * 3 * 3
+        self.length = 6
+        self.kernel_sizes = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0, 0, 0]
+        self.dilations = [1, 1]
+        self.output_sizes = [4, 5]
+        input_shape = [self.batch_size, self.input_channels, self.length]
+        self.x = np.random.rand(*input_shape).astype(np.float64)
 class TestFoldAPI(TestFoldOp):
-    #This is for test on paddle.nn.Fold
+    # This is for test on paddle.nn.Fold
    def setUp(self):
        self.op_type = 'fold'
@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp):
                m = paddle.nn.Fold(**self.attrs)
                m.eval()
                result = m(input)
-                np.testing.assert_allclose(result.numpy(),
+                np.testing.assert_allclose(
-                                           self.outputs['Y'],
+                    result.numpy(), self.outputs['Y'], rtol=1e-05
-                                           rtol=1e-05)
+                )
    def test_info(self):
        str(paddle.nn.Fold(**self.attrs))
 class TestFoldOpError(unittest.TestCase):
    def test_errors(self):
        from paddle.nn.functional import fold
        from paddle.fluid.framework import Program, program_guard
        with program_guard(Program(), Program()):
            def test_input_shape():
@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase):
            def test_padding_shape():
                # padding_size must be 2 or 4
                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(x,
+                out = fold(
+                    x,
                    output_sizes=[2, 3],
                    kernel_sizes=[2, 2],
-                           paddings=[2, 2, 3])
+                    paddings=[2, 2, 3],
+                )
            def test_dilations_shape():
                # dialtions_size must be 2
                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(x,
+                out = fold(
+                    x,
                    output_sizes=[2, 3],
                    kernel_sizes=[2, 2],
-                           dilations=[2, 2, 3])
+                    dilations=[2, 2, 3],
+                )
            def test_strides_shape():
                # strids_size must be 2
                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(x,
+                out = fold(
+                    x,
                    output_sizes=[2, 3],
                    kernel_sizes=[2, 2],
-                           strides=[2, 2, 3])
+                    strides=[2, 2, 3],
+                )
            def test_output_size():
                # im_h * im_w must be L
                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(x,
+                out = fold(
-                           output_sizes=[6, 6],
+                    x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]
-                           kernel_sizes=[2, 2],
+                )
-                           strides=[1, 1])
            def test_output_size_2():
                # out_size must GT 1
                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(x,
+                out = fold(
+                    x,
                    output_sizes=[0.1, 0.2],
                    kernel_sizes=[2, 2],
-                           strides=[1, 1])
+                    strides=[1, 1],
+                )
            def test_block_h_w():
                # test_block_h_w GT 0
                x = paddle.randn(shape=[2, 1, 1], dtype="float32")
-                out = fold(x,
+                out = fold(
-                           output_sizes=[1, 1],
+                    x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1
-                           kernel_sizes=[2, 2],
+                )
-                           strides=1)
            def test_GT_0():
                x = paddle.randn(shape=[2, 1, 1], dtype="float32")
-                out = fold(x,
+                out = fold(
+                    x,
                    output_sizes=[0, 0],
                    kernel_sizes=[0, 0],
                    dilations=0,
                    paddings=[0, 0],
-                           strides=0)
+                    strides=0,
+                )
            self.assertRaises(AssertionError, test_input_shape)
            self.assertRaises(AssertionError, test_kernel_shape)