# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ... import core
from ... import default_main_program
from ... import default_startup_program
from ... import framework
from ... import layers
from ... import program_guard
from ... import unique_name
from . import fp16_utils
from .fp16_utils import rewrite_program
from .fp16_utils import cast_model_to_fp16
from .fp16_utils import cast_parameters_to_fp16
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists
from .amp_nn import check_finite_and_unscale
from .amp_nn import update_loss_scaling
import types
import warnings

__all__ = ["decorate"]


class OptimizerWithMixedPrecision(object):
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
    optimizer, plus the support of mixed-precision training. The object
    of this class has almost the same behavior as the common optimizer, with the
    methods `minimize()`, `backward()`, `apply_gradients()` implemented.
    Additionally, it enables the MP training automatically, i.e., the creation
    and maintenance of master parameters, scaling of loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.

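    Examples:
        A minimal sketch of direct construction; in practice this object is
        usually created through `decorate` rather than instantiated directly,
        and `loss` and the base `optimizer` are assumed to be built as in the
        examples of `decorate` below.

        .. code-block:: python

            mp_optimizer = OptimizerWithMixedPrecision(
                optimizer, AutoMixedPrecisionLists(), init_loss_scaling=2**15,
                use_dynamic_loss_scaling=True, incr_every_n_steps=1000,
                decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.8,
                use_pure_fp16=False, use_fp16_guard=False)
            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()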
    """

    def __init__(self, optimizer, amp_lists, init_loss_scaling,
                 use_dynamic_loss_scaling, incr_every_n_steps,
                 decr_every_n_nan_or_inf, incr_ratio, decr_ratio, use_pure_fp16,
                 use_fp16_guard):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards communicate with each other, so overlap
        # communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor.
        """
        assert self._loss_scaling is not None, 'Call minimize() before calling get_loss_scaling()'
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It's useful when you feed a custom loss into the executor.
        """
        return self._scaled_loss

    def _init_amp_var(self):
        self._loss_scaling = layers.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True)

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = layers.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)
            self._num_bad_steps = layers.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[default_main_program()] = \
                    layers.create_global_var(
                    name=unique_name.generate("learning_rate"),
                    shape=[1],
                    value=float(self._optimizer._learning_rate),
                    dtype='float32',
                    persistable=True)

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        Backward propagation or auto differentiation for gradient computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing 
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   the backward operator for one parameter.

        Returns:
            A list of (param, grad) pairs, where each pair is a parameter and its
            scaled gradient. The scaled loss is stored and can be fetched with
            `get_scaled_loss()`.
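
        Examples:
            A minimal sketch; `mp_optimizer` and `loss` are assumed to be built
            as in the examples of `decorate`, and `minimize()` simply wraps
            these two calls.

            .. code-block:: python

                scaled_params_grads = mp_optimizer.backward(loss)
                optimize_ops = mp_optimizer.apply_optimize(
                    loss, None, scaled_params_grads)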
        """
        train_program = loss.block.program
        self._train_program = train_program

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

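            # For pure fp16 training, cast the whole program to fp16 and record
            # the names of the variables that were cast so that amp_init() can
            # later cast the corresponding parameters; otherwise only rewrite
            # the program's ops according to the amp lists.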
            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard)
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # When dynamic loss scaling is disabled and the initial loss scaling
            # equals 1.0, scaling the loss is a no-op and can be skipped.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss, startup_program, parameter_list, no_grad_set,
                callbacks)
        return params_grads

    def amp_init(self,
                 place,
                 scope=None,
                 test_program=None,
                 use_fp16_test=False):
        """
        Initialize AMP training, such as casting fp32 parameters to fp16.
  
        Args:
            place(CUDAPlace): The place used to initialize the fp16 parameters
                from their fp32 values.
            scope(Scope): The scope used to find the fp32 parameters.
            test_program(Program): The program used for testing.
            use_fp16_test(bool): Whether to use fp16 for testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True can help avoid poor accuracy
                    # or slow convergence.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())
                    
                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()       
        """
        assert self._train_program is not None, \
            "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(place, self._train_program, scope,
                                    self._to_fp16_var_names)
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(test_program, self._amp_lists,
                                   self._use_fp16_guard)
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update loss scaling and update
        parameters by their scaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.
    
        Returns:
            A list of optimize operators.
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # When dynamic loss scaling is disabled and the initial loss scaling
        # equals 1.0, the loss was never scaled, so the gradients can be
        # applied directly without checking or unscaling.
        if not self._use_dynamic_loss_scaling and self._init_loss_scaling == 1.0:
            return self._optimizer.apply_gradients(params_grads)

        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(grads), \
            "Data types of all grads must be either fp16 or fp32."

        found_infs = []
        if self._is_distributed:
            # if distributed, split check_finite_and_unscale to overlap
            # unscale with communication
            for p, g in params_grads:
                with self._train_program._optimized_guard([p, g]):
                    _, found_inf = check_finite_and_unscale(
                        [g, ], self._loss_scaling, name="find_infinite_scale")
                    found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32")
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16")
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads, self._loss_scaling, name="find_infinite_scale")

        if self._use_dynamic_loss_scaling:
            if self._is_distributed or self._use_pure_fp16:
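                # Merge the per-group `found_inf` flags gathered above into a
                # single flag before updating the loss scaling.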
                with self._train_program._optimized_guard([]):
                    all_infs = layers.concat(found_infs)
                    found_inf = layers.reduce_any(all_infs)

            if self._use_pure_fp16:
                stop_update = False
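                # The loss scaling and the good/bad step counters should be
                # updated only once per iteration: the first update_loss_scaling
                # call below performs the update, then `stop_update` is set to
                # True so the second call (when both fp32 and fp16 grads exist)
                # skips it.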
                with self._train_program._optimized_guard([]):
                    if fp32_grads:
                        update_loss_scaling(
                            fp32_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp32")
                        stop_update = True
                    if fp16_grads:
                        update_loss_scaling(
                            fp16_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp16")
            else:
                with self._train_program._optimized_guard([]):
                    update_loss_scaling(
                        grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        name="update_loss_scaling")

        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.

        Returns:
            The list of optimize ops and a list of scaled parameters and gradients.
        """
        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(opt_dict['minimize'],
                                                 types.FunctionType):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)

        optimize_ops = self.apply_optimize(loss, startup_program,
                                           scaled_params_grads)

        return optimize_ops, scaled_params_grads


def decorate(optimizer,
             amp_lists=None,
             init_loss_scaling=2**15,
             incr_every_n_steps=1000,
             decr_every_n_nan_or_inf=2,
             incr_ratio=2.0,
             decr_ratio=0.8,
             use_dynamic_loss_scaling=True,
             use_pure_fp16=False,
             use_fp16_guard=None):
    """ 
    Decorate the given optimizer to adapt it to mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value equals `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training 
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True can help avoid poor accuracy
                # or slow convergence.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())
                
            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
        use_pure_fp16, use_fp16_guard)

    return mp_optimizer