# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ... import core
from ... import default_main_program
from ... import default_startup_program
from ... import framework
from ... import layers
from ... import program_guard
from ... import unique_name
from . import fp16_utils
from .fp16_utils import rewrite_program
from .fp16_utils import cast_model_to_fp16
from .fp16_utils import cast_parameters_to_fp16
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists
from .amp_nn import check_finite_and_unscale
from .amp_nn import update_loss_scaling
import types
import warnings

__all__ = ["decorate"]


class OptimizerWithMixedPrecision(object):
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common 
Z
Zhen Wang 已提交
39
    optimizer, plus the support of mixed-precision pre-training. The object
40 41 42 43 44 45 46
    of this class almost has the same behavior as the common optimizer, with the 
    methods `minimize()`, `backward()`, `apply_gradients()` implemented. 
    Additionally, it enables the MP training automatically, i.e, the creation 
    and maintenance of master parameters, scaling of loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
J
Jie Fang 已提交
47
        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
48 49
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
J
Jie Fang 已提交
50 51 52 53 54 55 56 57 58
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
                           the loss scaling.
59 60 61
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.
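
    Examples:
        A minimal usage sketch in a static-graph program; ``network()``, the
        executor and the feed data are placeholders assumed to be defined
        elsewhere:

        .. code-block:: python

            import paddle.fluid as fluid

            loss = network()
            optimizer = fluid.optimizer.SGD(learning_rate=0.01)
            mp_optimizer = fluid.contrib.mixed_precision.decorate(
                optimizer=optimizer, init_loss_scaling=128.0)
            optimize_ops, params_grads = mp_optimizer.minimize(loss)
            # Fetch the scaled loss when running the program, because the
            # decorated optimizer backpropagates the scaled loss.
            scaled_loss = mp_optimizer.get_scaled_loss()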

    """

    def __init__(self, optimizer, amp_lists, init_loss_scaling,
                 use_dynamic_loss_scaling, incr_every_n_steps,
                 decr_every_n_nan_or_inf, incr_ratio, decr_ratio, use_pure_fp16,
                 use_fp16_guard):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards communicate with each other; overlap this
        # communication with computation by splitting the
        # check_finite_and_unscale op into one op per gradient.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor.
        """
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It is useful when you feed a customized loss into the executor.
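
        Examples:
            A brief, illustrative sketch; `exe`, `train_program` and `feed_dict`
            are placeholders assumed to be defined elsewhere:

            .. code-block:: python

                scaled_loss = mp_optimizer.get_scaled_loss()
                # Fetch the scaled loss actually backpropagated by the
                # decorated optimizer.
                loss_value, = exe.run(train_program,
                                      feed=feed_dict,
                                      fetch_list=[scaled_loss])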
        """
        return self._scaled_loss

    def _init_amp_var(self):
        self._loss_scaling = layers.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True)

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = layers.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)
            self._num_bad_steps = layers.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[default_main_program()] = \
                    layers.create_global_var(
                    name=unique_name.generate("learning_rate"),
                    shape=[1],
                    value=float(self._optimizer._learning_rate),
                    dtype='float32',
                    persistable=True)

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
Z
Zhen Wang 已提交
149
        Backward propagation or auto differentiation for gradients' computation.
150 151 152 153 154 155 156

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing 
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables should be ignored.
Z
Zhen Wang 已提交
157
            callbacks (list|None): A list of callable objects to run when appending
158 159 160 161 162 163
                                   backward operator for one parameter.

        Returns:
            A list of (param, grad), which is a tuple of a parameter and its 
            gradient respectively, and the scaled loss.
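
        Examples:
            A hedged sketch of calling `backward()` and `apply_optimize()`
            separately, e.g. to inspect the scaled gradients in between;
            `loss` and `startup_program` are assumed to be defined elsewhere:

            .. code-block:: python

                params_grads = mp_optimizer.backward(loss, startup_program)
                # params_grads holds gradients of the scaled loss; they are
                # unscaled inside apply_gradients()/apply_optimize().
                optimize_ops = mp_optimizer.apply_optimize(
                    loss, startup_program, params_grads)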
        """
        train_program = loss.block.program
        self._train_program = train_program

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard)
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # If dynamic loss scaling is disabled and the initial loss scaling
            # factor is 1.0, scaling the loss would be a no-op, so skip it.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss, startup_program, parameter_list, no_grad_set,
                callbacks)
        return params_grads

    def amp_init(self,
                 place,
                 scope=None,
                 test_program=None,
                 use_fp16_test=False):
        """
        Init the amp training, such as cast fp32 parameters to fp16 type.
  
        Args:
            place(CPUPlace|CUDAPlace): place is used to initialize 
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.
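
        Examples:
            A hedged sketch of the typical pure-fp16 set-up; the executor
            `exe`, `place`, `loss`, `optimizer` and `startup_program` are
            assumed to be created elsewhere:

            .. code-block:: python

                mp_optimizer = fluid.contrib.mixed_precision.decorate(
                    optimizer, use_pure_fp16=True)
                mp_optimizer.minimize(loss)
                exe.run(startup_program)
                # Cast the fp32 parameters created at startup to fp16 before
                # the first training step.
                mp_optimizer.amp_init(place, scope=fluid.global_scope())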

        """
        assert self._train_program is not None, \
            "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(place, self._train_program, scope,
                                    self._to_fp16_var_names)
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(test_program, self._amp_lists,
                                   self._use_fp16_guard)
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
220
        Check scaled gradients to determine whether to update loss scaling and update 
221
        parameters by their scaled gradients.
222 223
  
        Args:
224
            params_grads (list): A list of params and scaled grads.
225 226 227 228
    
        Returns:
            A list of optimize operators.
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # If dynamic loss scaling is disabled and the initial loss scaling
        # factor is 1.0, the gradients were never scaled, so apply them directly.
        if not self._use_dynamic_loss_scaling and self._init_loss_scaling == 1.0:
            return self._optimizer.apply_gradients(params_grads)

        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(grads), \
            "Data types of all grads must be either fp16 or fp32."

        found_infs = []
        if self._is_distributed:
            # If distributed, split check_finite_and_unscale so that unscaling
            # overlaps with communication.
            for p, g in params_grads:
                with self._train_program._optimized_guard([p, g]):
                    _, found_inf = check_finite_and_unscale(
                        [g, ], self._loss_scaling, name="find_infinite_scale")
                    found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32")
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16")
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads, self._loss_scaling, name="find_infinite_scale")

        if self._use_dynamic_loss_scaling:
            if self._is_distributed or self._use_pure_fp16:
                with self._train_program._optimized_guard([]):
                    all_infs = layers.concat(found_infs)
                    found_inf = layers.reduce_any(all_infs)

            if self._use_pure_fp16:
                stop_update = False
                with self._train_program._optimized_guard([]):
                    if fp32_grads:
                        update_loss_scaling(
                            fp32_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp32")
                        stop_update = True
                    if fp16_grads:
                        update_loss_scaling(
                            fp16_grads,
                            found_inf,
                            self._loss_scaling,
                            self._num_good_steps,
                            self._num_bad_steps,
                            self._incr_every_n_steps,
                            self._decr_every_n_nan_or_inf,
                            self._incr_ratio,
                            self._decr_ratio,
                            stop_update=stop_update,
                            name="update_loss_scaling_fp16")
            else:
                with self._train_program._optimized_guard([]):
                    update_loss_scaling(
                        grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        name="update_loss_scaling")

        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
G
gongweibao 已提交
343 344 345 346
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
347 348 349

        Returns:
            The scaled loss by scaling factor, the list of optimize ops, and a
350
            list of scaled parameters and gradients.
351
        """
        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(opt_dict['minimize'],
                                                 types.FunctionType):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)

        optimize_ops = self.apply_optimize(loss, startup_program,
                                           scaled_params_grads)

        return optimize_ops, scaled_params_grads


def decorate(optimizer,
             amp_lists=None,
             init_loss_scaling=2**15,
             incr_every_n_steps=1000,
             decr_every_n_nan_or_inf=2,
             incr_ratio=2.0,
             decr_ratio=0.8,
             use_dynamic_loss_scaling=True,
             use_pure_fp16=False,
             use_fp16_guard=None):
    """ 
    Decorate the given optimizer to adapt to the mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
J
Jie Fang 已提交
386
        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
387
        init_loss_scaling(float): The initial loss scaling factor.
J
Jie Fang 已提交
388 389 390 391 392 393 394 395 396
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
                           the loss scaling.
397
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
398 399 400
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value equals to `use_pure_fp16`.
401 402 403 404 405 406 407 408 409 410 411 412 413 414

    Returns:
        An optimizer acting like a normal one but with mixed-precision training 
        enabled.

    Examples:
	.. code-block:: python

	    loss = network()
            optimizer = fluid.optimizer.Adam(learning_rate=0.001)
	
            mp_optimizer = fluid.contrib.mixed_precision.decorate(
	              optimizer=optimizer, init_loss_scaling=8.0)
	
G
gongweibao 已提交
415
            ops, param_grads = mp_optimizer.minimize(loss)
416
            scaled_loss = mp_optimizer.get_scaled_loss()
417
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
        use_pure_fp16, use_fp16_guard)

    return mp_optimizer