#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
test for sync bachnorm op.
for both FP64 and FP16 input.
"""

import os
import random
import shutil
import sys
import tempfile
import unittest
from shlex import quote

import numpy as np
from decorator_helper import prog_scope
from eager_op_test import (
    OpTest,
    _set_use_system_allocator,
    convert_float_to_uint16,
)

import paddle
from paddle import fluid, nn
from paddle.fluid import Program, core, program_guard
from paddle.fluid.framework import in_dygraph_mode

_set_use_system_allocator(True)


def enable_static():
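    """Switch to static graph mode if needed; return a cleanup callable that restores the previous mode."""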
    if in_dygraph_mode():
        paddle.enable_static()

        def cleanup():
            paddle.disable_static()

    else:

        def cleanup():
            pass

    return cleanup


def convert_numpy_array(array):
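    """Convert a bfloat16 array (stored as uint16) to float32 via a paddle tensor; other dtypes pass through."""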
    if array.dtype != np.uint16:
        return array

    cleanup = None
    if not in_dygraph_mode():
        paddle.disable_static()
        cleanup = lambda: paddle.enable_static()

    out = paddle.to_tensor(array).astype(paddle.float32).numpy()
    if cleanup is not None:
        cleanup()
    return out


def create_or_get_tensor(scope, var_name, var, place):
    """Get tensor, if not found, create a new one."""
    tensor = scope.var(var_name).get_tensor()
    if var is not None:
        assert isinstance(var, np.ndarray)
        tensor.set_recursive_sequence_lengths([])
        tensor.set(var, place)
    return tensor


def clean_dir(path):
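    """Delete all files and subdirectories inside `path` (a str path or a TemporaryDirectory)."""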
    if isinstance(path, tempfile.TemporaryDirectory):
        path = path.name
    for f in os.listdir(path):
        f = os.path.join(path, f)
        if os.path.isdir(f):
            shutil.rmtree(f)
        else:
            os.remove(f)


def concat_cmd(cmd):
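    """Join a command given as a list into a single shell-quoted string; strings are returned unchanged."""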
    if isinstance(cmd, str):
        return cmd

    return ' '.join([quote(c) for c in cmd])


class TestSyncBatchNormOpTraining(unittest.TestCase):
    """sync_batch_norm op test."""

    def setUp(self):
        """Setup."""
        # float64 on CUDA builds; ROCm builds fall back to float32.
        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
        self.N = 8
        self.C = 16
        self.H = 32
        self.W = 32
        self.dshape = [self.N, self.C, self.H, self.W]
        self.atol = 1e-3
        self.data_dir = tempfile.TemporaryDirectory()
        self.fleet_log_dir = tempfile.TemporaryDirectory()

    def tearDown(self) -> None:
        self.data_dir.cleanup()
        self.fleet_log_dir.cleanup()

    def multi_device_run(self, layout, fetch_list, only_forward=False):
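        """Launch dist_fleet_sync_batch_norm.py through paddle.distributed.launch and assert it exits successfully."""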
        cmds = [
            sys.executable,
            "-m",
            "paddle.distributed.launch",
        ]
        cmds += ["--log_dir", self.fleet_log_dir.name]
        cmds += ["dist_fleet_sync_batch_norm.py"]
        cmds += ["--data_dir", self.data_dir.name]

        dshape = [
            self.N // core.get_cuda_device_count(),
            self.C,
            self.H,
            self.W,
        ]
        cmds += ["--dshape", str(dshape)]
        cmds += ["--dtype", str(self.dtype.__name__)]
        cmds += ["--layout", layout]
        cmds += ["--fetch_list", str(fetch_list)]
        if only_forward:
            cmds += ["--only_forward"]
        if self.dtype == np.float16 or self.dtype == np.uint16:
            cmds += ["--use_cudnn"]
        cmd = concat_cmd(cmds)
        assert os.system(cmd) == 0, cmd

    def _build_program(
        self, place, layout, seed, sync_bn=False, only_forward=False
    ):
        """Build program."""
        main = fluid.Program()
        startup = fluid.Program()
        main.random_seed = seed
        startup.random_seed = seed
        use_cudnn = (self.dtype == np.float16) or (self.dtype == np.uint16)
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                data = paddle.static.data(
                    name='input',
                    shape=self.dshape,
                    dtype=self.dtype,
                )
                data.desc.set_need_check_feed(False)
                conv = paddle.static.nn.conv2d(
                    input=data,
                    num_filters=32,
                    filter_size=1,
                    param_attr=fluid.ParamAttr(name='conv2d_weight'),
                    bias_attr=False,
                    use_cudnn=use_cudnn,
                )
                bn = paddle.static.nn.batch_norm(
                    conv,
                    param_attr=fluid.ParamAttr(name='bn_scale'),
                    bias_attr=fluid.ParamAttr(name='bn_bias'),
                    moving_mean_name='bn_moving_mean',
                    moving_variance_name='bn_moving_variance',
                    data_layout=layout,
                    is_test=only_forward,
                )
                if core.is_compiled_with_rocm():
                    bn = paddle.cast(bn, 'float32')
                else:
                    bn = paddle.cast(bn, 'float64')
                sigmoid = paddle.nn.functional.sigmoid(bn)
                out = paddle.sum(sigmoid)
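                # For the single-device (non-sync) reference, scale the loss by the
                # device count so its gradients are comparable with the multi-GPU run.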
                if not sync_bn:
                    out = out / core.get_cuda_device_count()
                if not only_forward:
                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
                    sgd_opt.backward(out)
        return main, startup, [out, conv, bn]

    @prog_scope()
    def _compare(self, place, layout, only_forward):
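        """Run the single-GPU vs. multi-GPU comparison, always cleaning the shared temp dirs afterwards."""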
        try:
            with paddle.utils.unique_name.guard():
                self._compare_impl(place, layout, only_forward)
        finally:
            clean_dir(self.data_dir)
            clean_dir(self.fleet_log_dir)

    def _compare_impl(self, place, layout, only_forward):
        """Compare results."""
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
        paddle.enable_static()
        scope = core.Scope()
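        # Generate input in [-2, 2); BF16 data is created as float32 and converted to uint16-backed bfloat16.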
        if self.dtype == np.uint16:
            data = convert_float_to_uint16(
                np.random.random(size=self.dshape).astype(np.float32) * 4.0 - 2
            )
        else:
            data = (
                np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2
            )
        stride = self.N // core.get_cuda_device_count()
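        # Split the batch evenly and save one .npy shard per GPU for the multi-device run.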
        for id in range(core.get_cuda_device_count()):
            filepath = os.path.join(
                self.data_dir.name,
                'input_{}_{}_{}_{}.npy'.format(
                    id, only_forward, str(self.dtype.__name__), layout
                ),
            )
            np.save(filepath, data[id * stride : (id + 1) * stride])
        data = create_or_get_tensor(
            scope, "input", OpTest.np_dtype_to_fluid_dtype(data), place
        )

        # Single-GPU reference run: the full batch of self.N samples on one device
        main, startup, outs = self._build_program(
            place, layout, seed, False, only_forward
        )
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean',
            'bn_moving_variance',
            'bn_scale',
            'bn_bias',
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0',
                'batch_norm_0.tmp_1',
                'bn_scale@GRAD',
                'bn_bias@GRAD',
                'batch_norm_0.tmp_3@GRAD',
                'conv2d_0.tmp_0@GRAD',
            ]
            fetch_names += others
        bn_fetches = exe.run(
            program=main, feed={'input': data}, fetch_list=fetch_names
        )

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
        assert core.get_cuda_device_count() > 1

        fetch_names = [
            'bn_moving_mean',
            'bn_moving_variance',
            'bn_scale',
            'bn_bias',
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0',
                'batch_norm_0.tmp_1',
                'bn_scale@GRAD',
                'bn_bias@GRAD',
                'batch_norm_0.tmp_3@GRAD',
                'conv2d_0.tmp_0@GRAD',
            ]
            fetch_names += others

        self.multi_device_run(
            layout, fetch_list=fetch_names, only_forward=only_forward
        )

        fetch_names = [v.name for v in outs] + fetch_names

        for i in range(1, len(bn_fetches)):
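            # Compare this single-GPU fetch against the corresponding multi-GPU output saved to disk.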
            bn_val = bn_fetches[i]
            file_path = os.path.join(
                self.data_dir.name,
                'output_{}_{}_{}_{}.npy'.format(
                    0, only_forward, self.dtype.__name__, i
                ),
            )
            sync_bn_val = np.load(file_path)
            if sync_bn_val.shape != bn_val.shape:
                bn_val = bn_val[:stride]
            np.testing.assert_allclose(
                convert_numpy_array(bn_val),
                convert_numpy_array(sync_bn_val),
                rtol=1e-05,
                atol=self.atol,
                err_msg=f'Output ({fetch_names[i]}) has diff.\n'
                f'\nBN     {bn_val}\n'
                f'Sync BN {sync_bn_val}',
            )

    def test_train(self):
        """Test training."""
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NHWC", "NCHW"]:
                self._compare(place, layout, False)

    def test_infer(self):
        """Test inference."""
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NHWC", "NCHW"]:
                self._compare(place, layout, True)


class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
    """sync_batch_norm op test for FP16 input."""

    def setUp(self):
        """Setup."""
        self.dtype = np.float16
        self.N = 8
        self.C = 16
        self.H = 32
        self.W = 32
        self.dshape = [self.N, self.C, self.H, self.W]
        self.atol = 1e-3
        self.data_dir = tempfile.TemporaryDirectory()
        self.fleet_log_dir = tempfile.TemporaryDirectory()


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or not support the bfloat16",
)
class TestBF16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
    """sync_batch_norm op test for BF16 input."""

    def setUp(self):
        """Setup."""
        self.dtype = np.uint16
        self.N = 8
        self.C = 16
        self.H = 32
        self.W = 32
        self.dshape = [self.N, self.C, self.H, self.W]
        self.atol = 1e-2
        self.data_dir = tempfile.TemporaryDirectory()
        self.fleet_log_dir = tempfile.TemporaryDirectory()


class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
    def test_errors(self):
        if not core.is_compiled_with_cuda():
            return

        cleanup = enable_static()
        with program_guard(Program(), Program()):
            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
            x1 = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0)
            )
            self.assertRaises(TypeError, my_sync_batch_norm, x1)

            # The input dtype of SyncBatchNorm must be float16, float32 or float64;
            # float16 is only supported on GPU places.
            x2 = paddle.static.data(
                name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32"
            )
            x2.desc.set_need_check_feed(False)
            self.assertRaises(TypeError, my_sync_batch_norm, x2)
        cleanup()


class TestConvertSyncBatchNorm(unittest.TestCase):
    def test_convert(self):
        if not core.is_compiled_with_cuda():
            return

        with program_guard(Program(), Program()):
            compare_model = paddle.nn.Sequential(
                paddle.nn.Conv2D(3, 5, 3),
                paddle.nn.BatchNorm2D(5),
                paddle.nn.BatchNorm2D(5),
            )
            model = paddle.nn.Sequential(
                paddle.nn.Conv2D(3, 5, 3),
                paddle.nn.BatchNorm2D(5),
                paddle.nn.BatchNorm2D(
                    5,
                    weight_attr=fluid.ParamAttr(name='bn.scale'),
                    bias_attr=fluid.ParamAttr(name='bn.bias'),
                ),
            )
            model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
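            # Each BatchNorm2D position in the original model should now hold a SyncBatchNorm.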
            for idx, sublayer in enumerate(compare_model.sublayers()):
                if isinstance(sublayer, paddle.nn.BatchNorm2D):
                    self.assertEqual(
                        isinstance(model[idx], paddle.nn.SyncBatchNorm), True
                    )


class TestConvertSyncBatchNormCast1(unittest.TestCase):
    def test_convert(self):
        if not core.is_compiled_with_cuda():
            return

        class Net(nn.Layer):
            def __init__(self):
                super().__init__()
                self.conv1 = nn.Conv2D(3, 5, 3)
                self.bn = []
                bn = self.add_sublayer('bn', nn.BatchNorm2D(5))
                self.bn.append(bn)

            def forward(self, x):
                x = self.conv1(x)
                for bn in self.bn:
                    x = bn(x)
                return x

        model = nn.Sequential()
        model.add_sublayer('net1', Net())
        model.add_sublayer('net2', Net())
        compare_model = nn.Sequential()
        compare_model.add_sublayer('net1', Net())
        compare_model.add_sublayer('net2', Net())
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
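        # Conversion must preserve the layer structure (same number of sublayers).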
        self.assertEqual(len(compare_model.sublayers()), len(model.sublayers()))


class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase):
    def test_errors(self):
        if not core.is_compiled_with_cuda():
            return

        with fluid.dygraph.guard(fluid.CUDAPlace(0)):
            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10, data_format='CN')
            data = np.random.random([3, 3, 3]).astype('float32')
            x = paddle.to_tensor(data)
            self.assertRaises(ValueError, my_sync_batch_norm, x)


if __name__ == '__main__':
    paddle.seed(0)
    np.random.seed(0)
    random.seed(0)
    unittest.main()