decorator.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ... import core
from ... import default_main_program
from ... import default_startup_program
from ... import framework
from ... import layers
from ... import program_guard
from ... import unique_name
from . import fp16_utils
from .fp16_utils import rewrite_program
from .fp16_utils import cast_model_to_fp16
from .fp16_utils import cast_parameters_to_fp16
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists
from .amp_nn import check_finite_and_unscale
from .amp_nn import update_loss_scaling
import types
import warnings

__all__ = ["decorate"]


class OptimizerWithMixedPrecision(object):
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common 
Z
Zhen Wang 已提交
39
    optimizer, plus the support of mixed-precision pre-training. The object
40 41 42 43 44 45 46
    of this class almost has the same behavior as the common optimizer, with the 
    methods `minimize()`, `backward()`, `apply_gradients()` implemented. 
    Additionally, it enables the MP training automatically, i.e, the creation 
    and maintenance of master parameters, scaling of loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
H
huangxu96 已提交
47
        amp_lists (CustomOpLists): An CustomOpLists object.
48 49
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
J
Jie Fang 已提交
50 51 52 53 54 55 56 57 58
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
                           the loss scaling.
59 60 61
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.
J
Jie Fang 已提交
62

63 64
    """

    def __init__(self, optimizer, amp_lists, init_loss_scaling,
                 use_dynamic_loss_scaling, incr_every_n_steps,
                 decr_every_n_nan_or_inf, incr_ratio, decr_ratio, use_pure_fp16,
                 use_fp16_guard):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards communicate with each other, so overlap
        # communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor.
        """
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It's useful when you feed a customized loss into the executor.
        """
        return self._scaled_loss

    def _init_amp_var(self):
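        # Create the persistable global variables that hold the loss scaling
        # factor and, when dynamic loss scaling is enabled, the good/bad step
        # counters.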
        self._loss_scaling = layers.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True)

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = layers.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)
            self._num_bad_steps = layers.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[default_main_program()] = \
                    layers.create_global_var(
                    name=unique_name.generate("learning_rate"),
                    shape=[1],
                    value=float(self._optimizer._learning_rate),
                    dtype='float32',
                    persistable=True)

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        Backward propagation or auto differentiation for gradients' computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   the backward operator for one parameter.

        Returns:
            A list of (param, grad), which is a tuple of a parameter and its
            gradient respectively.
        """
        train_program = loss.block.program
        self._train_program = train_program

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
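                # Pure fp16: cast the training program to fp16, subject to the
                # amp lists and the optional fp16_guard.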
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard)
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # If dynamic loss scaling is disabled and the initial loss scaling
            # factor is 1.0, scaling the loss is a no-op, so skip it.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss, startup_program, parameter_list, no_grad_set,
                callbacks)
        return params_grads

    def amp_init(self,
                 place,
                 scope=None,
                 test_program=None,
                 use_fp16_test=False):
        """
        Initialize AMP training, such as casting fp32 parameters to fp16.
  
        Args:
            place(CUDAPlace): The place is used to initialize
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True can help avoid poor accuracy
                    # or slow convergence.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())
                    
                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()       
        """
        assert self._train_program is not None, \
            "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(place, self._train_program, scope,
                                    self._to_fp16_var_names)
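        # Optionally rewrite the test program so that evaluation runs with the
        # same precision strategy as training.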
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(test_program, self._amp_lists,
                                   self._use_fp16_guard)
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update loss scaling and update
        parameters by their scaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.

        Returns:
            A list of optimize operators.
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # If dynamic loss scaling is disabled and the initial loss scaling
        # factor is 1.0, the gradients were never scaled, so the optimizer can
        # be applied directly.
        if not self._use_dynamic_loss_scaling and self._init_loss_scaling == 1.0:
            return self._optimizer.apply_gradients(params_grads)

        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(grads), \
            "Data types of all grads must be either fp16 or fp32."

        found_infs = []
        if self._is_distributed:
            # if distributed, split check_finite_and_unscale to overlap
            # unscale with communication
            for p, g in params_grads:
                with self._train_program._optimized_guard([p, g]):
                    _, found_inf = check_finite_and_unscale(
                        [g, ], self._loss_scaling, name="find_infinite_scale")
                    found_infs.append(found_inf)
        elif self._use_pure_fp16:
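            # In pure fp16 training the gradients may be a mix of fp32 and fp16
            # dtypes, so check and unscale each dtype group separately; the
            # resulting found_inf flags are merged below.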
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32")
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16")
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads, self._loss_scaling, name="find_infinite_scale")

        if self._use_dynamic_loss_scaling:
            if self._is_distributed or self._use_pure_fp16:
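                # Merge the found_inf flags collected above into a single flag.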
                with self._train_program._optimized_guard([]):
                    all_infs = layers.concat(found_infs)
                    found_inf = layers.reduce_any(all_infs)

            if self._use_pure_fp16:
                stop_update = False
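                # The loss scaling state should only be updated once per step:
                # stop_update is set to True after the first update_loss_scaling
                # call so the second call does not update it again.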
                with self._train_program._optimized_guard([]):
                    if fp32_grads:
                        update_loss_scaling(
                            fp32_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp32")
                        stop_update = True
                    if fp16_grads:
                        update_loss_scaling(
                            fp16_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp16")
            else:
                with self._train_program._optimized_guard([]):
                    update_loss_scaling(
                        grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        name="update_loss_scaling")

        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): The startup Program for initializing parameters
                in `parameter_list`.
            parameter_list (list): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.

        Returns:
            The list of optimize ops and a list of scaled parameters and
            gradients.
        """
        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(opt_dict['minimize'],
                                                 types.FunctionType):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)

        optimize_ops = self.apply_optimize(loss, startup_program,
                                           scaled_params_grads)

        return optimize_ops, scaled_params_grads


def decorate(optimizer,
             amp_lists=None,
             init_loss_scaling=2**15,
             incr_every_n_steps=1000,
             decr_every_n_nan_or_inf=2,
             incr_ratio=2.0,
             decr_ratio=0.8,
             use_dynamic_loss_scaling=True,
             use_pure_fp16=False,
             use_fp16_guard=None):
    """ 
    Decorate the given optimizer to adapt to the mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value equals `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training 
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True can help avoid poor accuracy
                # or slow convergence.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())
                
            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
        use_pure_fp16, use_fp16_guard)

    return mp_optimizer