Unverified commit ebff68fa, authored by Yang Zhang, committed by GitHub

Add float16 support to `sync_batch_norm_op` (#19681)

* Add float16 support to `sync_batch_norm_op`

test=develop

* Add test for sync_bn with FP16 input

test=develop
Parent 039b9710
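In outline, the new test builds the usual conv2d + batch_norm + sigmoid + reduce_sum tower with FP16 input, routes the convolution through cuDNN, and casts the batch-norm output back to float64 before the loss, so only the conv/BN path actually runs in half precision. A condensed, illustrative sketch of that pattern (assuming a CUDA-enabled build of paddle.fluid; the names and shapes here are examples, not part of the commit):

    import paddle.fluid as fluid

    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        # FP16 input; cuDNN is enabled for conv2d in the half-precision case.
        data = fluid.layers.data(
            name='input', shape=[16, 64, 32], dtype='float16')
        conv = fluid.layers.conv2d(
            data, num_filters=16, filter_size=1, use_cudnn=True)
        bn = fluid.layers.batch_norm(conv, data_layout='NCHW')
        # Upcast before sigmoid/reduce_sum so the loss accumulates in FP64.
        bn64 = fluid.layers.cast(bn, 'float64')
        out = fluid.layers.reduce_sum(fluid.layers.sigmoid(bn64))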
@@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test for the sync_batch_norm op,
for both FP64 and FP16 input.
"""
from __future__ import print_function
@@ -22,9 +26,24 @@ import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler
from op_test import OpTest
def create_or_get_tensor(scope, var_name, var, place):
"""Get tensor, if not found, create a new one."""
tensor = scope.var(var_name).get_tensor()
if var is not None:
assert isinstance(var, np.ndarray)
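# An empty recursive-sequence-lengths list marks this as a plain dense (non-LoD) tensor.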
tensor.set_recursive_sequence_lengths([])
tensor.set(var, place)
return tensor
class TestSyncBatchNormOpTraining(unittest.TestCase):
"""sync_batch_norm op test."""
def setUp(self):
"""Setup."""
# FP64 by default; TestFP16SyncBatchNormOpTraining below overrides this.
self.dtype = np.float64
self.N = 32
@@ -32,17 +51,20 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
self.H = 64
self.W = 32
self.dshape = [self.N, self.C, self.H, self.W]
self.atol = 1e-3
def build_program(self,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
def _build_program(self,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
"""Build program."""
main = fluid.Program()
startup = fluid.Program()
main.random_seed = seed
startup.random_seed = seed
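# Route conv2d through cuDNN only for FP16 input; FP32/FP64 keep the default path.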
use_cudnn = self.dtype == np.float16
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
data = fluid.layers.data(
@@ -56,7 +78,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
filter_size=1,
param_attr=fluid.ParamAttr(name='conv2d_weight'),
bias_attr=False,
use_cudnn=False)
use_cudnn=use_cudnn)
bn = fluid.layers.batch_norm(
conv,
param_attr=fluid.ParamAttr(name='bn_scale'),
@@ -65,6 +87,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
moving_variance_name='bn_moving_variance',
data_layout=layout,
is_test=only_forward)
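# Upcast the BN output (a no-op for FP64 input) so sigmoid, reduce_sum,
# and the comparisons below all run at full precision.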
bn = fluid.layers.cast(bn, 'float64')
sigmoid = fluid.layers.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
if not sync_bn:
@@ -74,13 +97,18 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
sgd_opt.backward(out)
return main, startup, [out, conv, bn]
def compare(self, place, layout, only_forward):
def _compare(self, place, layout, only_forward):
"""Compare results."""
seed = 10
os.environ['FLAGS_cudnn_deterministic'] = "1"
scope = core.Scope()
data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
data = create_or_get_tensor(scope, "input",
OpTest.np_dtype_to_fluid_dtype(data), place)
# Single-GPU, N = 32 per GPU
main, startup, outs = self.build_program(place, layout, seed, False,
only_forward)
main, startup, outs = self._build_program(place, layout, seed, False,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
@@ -99,8 +127,8 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
#####################################################################
# Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
assert core.get_cuda_device_count() > 1
main, startup, outs = self.build_program(place, layout, seed, True,
only_forward)
main, startup, outs = self._build_program(place, layout, seed, True,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
@@ -133,27 +161,43 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
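# Multi-GPU fetches may come back concatenated across devices;
# truncate to the single-GPU shape before comparing.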
sync_bn_val = sync_bn_val[:bn_val.shape[0]]
self.assertTrue(
np.allclose(
bn_val, sync_bn_val, atol=1e-3),
bn_val, sync_bn_val, atol=self.atol),
"Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " +
str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
def test_train(self):
"""Test training."""
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, False)
self._compare(place, layout, False)
def test_infer(self):
"""Test inference."""
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, True)
self._compare(place, layout, True)
class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
"""sync_batch_norm op test for FP16 input."""
def setUp(self):
"""Setup."""
self.dtype = np.float16
self.N = 32
self.C = 16
self.H = 64
self.W = 32
self.dshape = [self.N, self.C, self.H, self.W]
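# FP16 carries more rounding error, so use a looser tolerance than the FP64 default of 1e-3.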
self.atol = 1e-2
if __name__ == '__main__':
......