From 93535c59043a4d6b10d7d982deb928ec98be884d Mon Sep 17 00:00:00 2001
From: arlesniak
Date: Thu, 29 Apr 2021 17:19:57 +0200
Subject: [PATCH] Added pure_bf16 mode (#32281) (#32681)

This is a cherry-pick of #32281
---
 paddle/fluid/operators/assign_op.cc           |   1 +
 .../fluid/contrib/mixed_precision/__init__.py |   3 -
 .../contrib/mixed_precision/bf16/__init__.py  |   4 +-
 .../contrib/mixed_precision/bf16/amp_lists.py |  14 +-
 .../contrib/mixed_precision/bf16/amp_utils.py | 219 +++++++++++-
 .../contrib/mixed_precision/bf16/decorator.py | 318 ++++++++++++++++++
 .../fluid/contrib/tests/test_bf16_utils.py    |  26 +-
 .../contrib/tests/test_model_cast_to_bf16.py  |  36 +-
 python/paddle/fluid/layers/nn.py              |   3 +-
 python/paddle/fluid/layers/tensor.py          |   7 +-
 .../fluid/tests/book/test_fit_a_line.py       |  78 +++--
 .../fluid/tests/book/test_word2vec_book.py    |  39 ++-
 .../tests/unittests/test_optimizer_grad.py    |  32 +-
 python/paddle/static/amp/__init__.py          |   5 +-
 14 files changed, 699 insertions(+), 86 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py

diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index add533bafcb..433cabcfee0 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
                                ops::AssignKernel, int, ops::AssignKernel,
                                int64_t, ops::AssignKernel, bool,
                                ops::AssignKernel, plat::float16,
+                               ops::AssignKernel, plat::bfloat16,
                                ops::AssignKernel);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/python/paddle/fluid/contrib/mixed_precision/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py
index 571b755b50d..a580ae5574c 100644
--- a/python/paddle/fluid/contrib/mixed_precision/__init__.py
+++ b/python/paddle/fluid/contrib/mixed_precision/__init__.py
@@ -20,10 +20,7 @@ from . import fp16_lists
 from .fp16_lists import *
 from . import fp16_utils
 from .fp16_utils import *
-from . import bf16
-from .bf16 import *

 __all__ = decorator.__all__
 __all__ += fp16_lists.__all__
 __all__ += fp16_utils.__all__
-__all__ += bf16.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
index 8c05bc4899c..d3632729a3b 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py
@@ -18,7 +18,9 @@ from . import amp_lists
 from .amp_lists import *
 from . import amp_utils
 from .amp_utils import *
+from . import decorator
+from .decorator import *

-__all__ = []
+__all__ = decorator.__all__
 __all__ += amp_lists.__all__
 __all__ += amp_utils.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
index 81dc32d114b..1cf54aa0838 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py
@@ -13,8 +13,10 @@
 # limitations under the License.
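+# Usage note: AutoMixedPrecisionListsBF16(custom_bf16_list={'tanh'}) moves
+# 'tanh' from fp32_list into bf16_list, custom_fp32_list works the other way
+# round, and naming the same op in both custom lists raises ValueError (see
+# test_bf16_utils.py below).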
import copy +from paddle.fluid import core + from ..fp16_lists import white_list as white_list_fp16, black_list as black_list_fp16,\ - gray_list as gray_list_fp16, unsupported_fp16_list + gray_list as gray_list_fp16 __all__ = ["AutoMixedPrecisionListsBF16"] @@ -82,11 +84,17 @@ bf16_list = {'elementwise_add', } # depends on the prev_op type gray_list = { + 'cast', + 'fill_constant', + 'reduce_mean', 'reshape2', - 'lookup_table', + 'scale', } -unsupported_list = unsupported_fp16_list.copy().copy() +_, _, _sys_unsupported_bf16_list = core.op_supported_infos( + 'CPU', core.VarDesc.VarType.BF16) +unsupported_list = _sys_unsupported_bf16_list + fp32_list = black_list_fp16.copy().copy() fp32_list |= white_list_fp16 fp32_list |= gray_list_fp16 diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py index c2c01f88c74..038479098a6 100644 --- a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py @@ -14,18 +14,25 @@ # limitations under the License. from __future__ import print_function -import struct from .... import core from .... import framework +from .... import global_scope from ....log_helper import get_logger from ....wrapped_decorator import signature_safe_contextmanager from .amp_lists import AutoMixedPrecisionListsBF16 -from ..fp16_utils import find_true_prev_op, find_true_post_op, _rename_arg, find_op_index +from ..fp16_utils import find_true_prev_op, find_true_post_op, _rename_arg, \ + find_op_index, _rename_op_input + +import collections +import struct import logging import numpy as np -__all__ = ["bf16_guard", "rewrite_program_bf16", "convert_float_to_uint16"] +__all__ = [ + "bf16_guard", "rewrite_program_bf16", "cast_model_to_bf16", + "cast_parameters_to_bf16", "convert_float_to_uint16" +] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') @@ -126,7 +133,41 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): return num_cast_ops +def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name, + op_var_rename_map): + num_cast_ops = 0 + target_var = block.var(target_name) + if target_var.type not in _valid_types or target_var.dtype == dest_dtype: + return num_cast_ops + + assert target_var.dtype == src_dtype, \ + "The real dtype({}) is not equal to the src dtype({})".format(_dtype_to_str(target_var.dtype), _dtype_to_str(src_dtype)) + + cast_name = target_var.name + '.cast_' + _dtype_to_str(dest_dtype) + cast_var = block.vars.get(cast_name) + if cast_var is None or cast_var.dtype != dest_dtype: + cast_var = block.create_var( + name=cast_name, + dtype=dest_dtype, + persistable=False, + stop_gradient=target_var.stop_gradient) + block._insert_op( + idx, + type="cast", + inputs={"X": target_var}, + outputs={"Out": cast_var}, + attrs={"in_dtype": target_var.dtype, + "out_dtype": cast_var.dtype}) + num_cast_ops += 1 + op_var_rename_map[block.idx][target_var.name] = cast_var.name + + return num_cast_ops + + def _is_in_fp32_varnames(op, amp_lists): + if not amp_lists.fp32_varnames: + return False + for in_name in op.input_arg_names: if in_name in amp_lists.fp32_varnames: return True @@ -191,7 +232,174 @@ def bf16_guard(): yield -def rewrite_program_bf16(main_prog, amp_lists=None, use_bf16_guard=False): +def cast_model_to_bf16(program, amp_lists=None, use_bf16_guard=True): + """ + Traverse all ops in the whole model and set their inputs and outputs + to the bf16 data type. 
This function will do some special processing for + the batch normalization, which will keep the batchnorm's computations in FP32. + Args: + program (Program): The used program. + amp_lists (AutoMixedPrecisionListsBF16): An AutoMixedPrecisionListsBF16 object. + use_bf16_guard(bool): Determine whether to use `bf16_guard` when + constructing the program. Default True. + """ + + if amp_lists is None: + amp_lists = AutoMixedPrecisionListsBF16() + global_block = program.global_block() + keep_fp32_ops = set() + to_bf16_var_names = set() + to_bf16_pre_cast_ops = set() + origin_ops = [] + for block in program.blocks: + origin_ops.extend(block.ops) + + for block in program.blocks: + ops = block.ops + for op in ops: + if op.type == 'create_py_reader' or op.type == 'read': + continue + if _need_keep_fp32(op, amp_lists.unsupported_list, use_bf16_guard): + keep_fp32_ops.add(op) + continue # processed below + for in_name in op.input_names: + if op.type in { + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' + } and in_name not in {'X', 'Z'}: + continue + for in_var_name in op.input(in_name): + in_var = None + try: + in_var = block.var(in_var_name) + except ValueError as e: + _logger.debug( + "-- {}, try to get it in the global block --". + format(e)) + in_var = global_block.var(in_var_name) + if in_var is not None: + _logger.debug( + "-- var {} is got in the global block --". + format(in_var_name)) + + if in_var is None or in_var.type not in _valid_types: + continue + + if in_var.dtype == core.VarDesc.VarType.FP32: + in_var.desc.set_dtype(core.VarDesc.VarType.BF16) + to_bf16_var_names.add(in_var_name) + + _logger.debug( + "-- op type: {}, in var name: {}, in var dtype: {} --". + format(op.type, in_var_name, in_var.dtype)) + + for out_name in op.output_names: + if op.type in { + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' + } and out_name != 'Y': + continue + for out_var_name in op.output(out_name): + out_var = None + try: + out_var = block.var(out_var_name) + except ValueError as e: + _logger.debug( + "-- {}, try to get it in the global block --". + format(e)) + out_var = global_block.var(out_var_name) + if out_var is not None: + _logger.debug( + "-- var {} is got in the global block --". + format(out_var_name)) + + if out_var is None or out_var.type not in _valid_types: + continue + + if out_var.dtype == core.VarDesc.VarType.FP32: + out_var.desc.set_dtype(core.VarDesc.VarType.BF16) + + _logger.debug( + "-- op type: {}, out var name: {}, out var dtype: {} --". 
+                        format(op.type, out_var_name, out_var.dtype))
+            for attr_name in ['in_dtype', 'out_dtype', 'dtype']:
+                if op.has_attr(attr_name) and op.attr(
+                        attr_name) == core.VarDesc.VarType.FP32:
+                    op._set_attr(attr_name, core.VarDesc.VarType.BF16)
+            if op.has_attr('use_mkldnn'):
+                op._set_attr('use_mkldnn', True)
+            if op.has_attr('mkldnn_data_type'):
+                op._set_attr('mkldnn_data_type', 'bfloat16')
+
+    # process ops in keep_fp32_ops
+    op_var_rename_map = [
+        collections.OrderedDict() for _ in range(len(program.blocks))
+    ]
+    for block in program.blocks:
+        ops = block.ops
+        idx = 0
+        while idx < len(ops):
+            op = ops[idx]
+            num_cast_ops = 0
+            if op not in keep_fp32_ops:
+                if op in to_bf16_pre_cast_ops:
+                    in_var_cast_num = _insert_cast_op(block, op, idx,
+                                                      core.VarDesc.VarType.FP32,
+                                                      core.VarDesc.VarType.BF16)
+                    num_cast_ops += in_var_cast_num
+            else:
+                pre_cast_num = _insert_cast_op(block, op, idx,
+                                               core.VarDesc.VarType.BF16,
+                                               core.VarDesc.VarType.FP32)
+                num_cast_ops += pre_cast_num
+                for out_var_name in op.output_arg_names:
+                    out_var = block.vars.get(out_var_name)
+                    if out_var is None or out_var.type not in _valid_types:
+                        continue
+                    if out_var.dtype == core.VarDesc.VarType.BF16:
+                        out_var.desc.set_dtype(core.VarDesc.VarType.FP32)
+                        post_ops = find_true_post_op(ops, op, out_var_name)
+                        for post_op in post_ops:
+                            if post_op in keep_fp32_ops:
+                                continue
+                            post_cast_num = _insert_cast_post_op(
+                                block, op, idx + pre_cast_num + 1,
+                                core.VarDesc.VarType.FP32,
+                                core.VarDesc.VarType.BF16, out_var_name,
+                                op_var_rename_map)
+                            num_cast_ops += post_cast_num
+            idx += num_cast_ops + 1
+
+    _rename_op_input(program, op_var_rename_map, origin_ops, keep_fp32_ops)
+    return to_bf16_var_names
+
+
+def cast_parameters_to_bf16(place, program, scope=None, to_bf16_var_names=None):
+    """
+    Traverse all parameters in the whole model and set them to the BF16 data type.
+    However, this function keeps the parameters of batchnorms in FP32.
+    Args:
+        place(fluid.CPUPlace|fluid.CUDAPlace): `place` is used to restore the BF16 weight tensors.
+        program (Program): The used program.
+        scope(fluid.Scope, optional): `scope` is used to get the FP32 weight tensor values.
+                                      Default is None.
+        to_bf16_var_names(set|list, optional): The data types of vars in `to_bf16_var_names`
+                                               will be set to BF16. Usually, it is the returned
+                                               value of the `cast_model_to_bf16` API.
+    """
+    all_parameters = []
+    for block in program.blocks:
+        all_parameters.extend(block.all_parameters())
+
+    bf16_var_names = to_bf16_var_names if to_bf16_var_names else set()
+    var_scope = scope if scope else global_scope()
+    for param in all_parameters:
+        if param.name in bf16_var_names:
+            _logger.debug("---- cast {} to bf16 dtype ----".format(param.name))
+            param_t = var_scope.find_var(param.name).get_tensor()
+            data = np.array(param_t)
+            param_t.set(convert_float_to_uint16(data), place)
+
+
+def rewrite_program_bf16(main_prog, amp_lists=None):
     """
     Traverse all ops in current block and insert cast op according to
     which set current op belongs to.
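+    Unlike cast_model_to_bf16 above, which casts the whole program in place and
+    returns the names of the variables it casted, this pass only inserts cast
+    ops around the ops selected by the amp lists.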
@@ -231,8 +439,7 @@ def rewrite_program_bf16(main_prog, amp_lists=None, use_bf16_guard=False):
             fp32_op_set.add(op)
             continue

-        if op.type in amp_lists.fp32_list or _need_keep_fp32(
-                op, amp_lists.unsupported_list, use_bf16_guard):
+        if op.type in amp_lists.fp32_list:
             fp32_op_set.add(op)
         elif op.type in amp_lists.bf16_list:
             bf16_op_set.add(op)
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
new file mode 100644
index 00000000000..86b5a5df75d
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
@@ -0,0 +1,318 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid import (core, default_main_program, layers, program_guard,
+                          unique_name)
+from .amp_utils import (rewrite_program_bf16, cast_model_to_bf16,
+                        cast_parameters_to_bf16)
+from .amp_lists import AutoMixedPrecisionListsBF16
+import types
+import warnings
+
+__all__ = ["decorate_bf16"]
+
+
+class OptimizerWithMixedPrecision(object):
+    """
+    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
+    optimizer, plus the support of mixed-precision training. An object of this
+    class behaves almost the same as the wrapped optimizer, with the methods
+    `minimize()`, `backward()` and `apply_gradients()` implemented.
+    Additionally, it enables MP training automatically, i.e., the creation
+    and maintenance of master parameters.
+
+    Args:
+        optimizer (Optimizer): A common Optimizer object.
+        amp_lists (AutoMixedPrecisionListsBF16): An AutoMixedPrecisionListsBF16 object.
+        use_pure_bf16(bool): Whether to use pure bf16 training.
+        use_bf16_guard(bool): Whether to use `bf16_guard` when constructing the program.
+
+    """
+
+    def __init__(self, optimizer, amp_lists, use_pure_bf16, use_bf16_guard):
+        self._optimizer = optimizer
+        self._amp_lists = amp_lists
+        self._param_grads = None
+        self._train_program = None
+
+        self._learning_rate = optimizer._learning_rate
+        self._learning_rate_map = optimizer._learning_rate_map
+        self._use_pure_bf16 = use_pure_bf16
+        self._use_bf16_guard = use_bf16_guard
+        self._to_bf16_var_names = None
+
+    def _init_amp_var(self):
+        # Ensure the data type of learning rate vars is float32 (same as the
+        # master parameter dtype)
+        if isinstance(self._optimizer._learning_rate, float):
+            self._optimizer._learning_rate_map[default_main_program()] = \
+                layers.create_global_var(
+                    name=unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=float(self._optimizer._learning_rate),
+                    dtype='float32',
+                    persistable=True)
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        """
+        Backward propagation or auto differentiation for gradients' computation.
+
+        Args:
+            loss (Variable): The loss Variable to minimize.
+            startup_program (Program|None): The startup Program for initializing
+                                            parameters in `parameter_list`.
+            parameter_list (list|None): A list of Variables to update.
+            no_grad_set (set|None): A set of Variables that should be ignored.
+            callbacks (list|None): A list of callable objects to run when appending
+                                   the backward operator for one parameter.
+
+        Returns:
+            A list of (param, grad) pairs, where each pair is a parameter and
+            its gradient.
+        """
+        train_program = loss.block.program
+        self._train_program = train_program
+
+        with program_guard(self._train_program, startup_program):
+            self._init_amp_var()
+
+            if self._use_pure_bf16:
+                self._to_bf16_var_names = cast_model_to_bf16(
+                    self._train_program, self._amp_lists, self._use_bf16_guard)
+            else:
+                rewrite_program_bf16(self._train_program, self._amp_lists)
+
+            if loss.dtype != core.VarDesc.VarType.FP32:
+                loss = loss.astype('float32')
+
+            params_grads = self._optimizer.backward(
+                loss, startup_program, parameter_list, no_grad_set, callbacks)
+        return params_grads
+
+    def amp_init(self,
+                 place,
+                 scope=None,
+                 test_program=None,
+                 use_bf16_test=False):
+        """
+        Initialize AMP training, e.g. cast the FP32 parameters to BF16.
+
+        Args:
+            place(CPUPlace): The place used to initialize the BF16 parameters
+                             from their FP32 values.
+            scope(Scope): The scope used to find the FP32 parameters.
+            test_program(Program): The program used for testing.
+            use_bf16_test(bool): Whether to use bf16 testing.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                import paddle.nn.functional as F
+                paddle.enable_static()
+
+                def run_example_code():
+                    place = paddle.CPUPlace()
+                    exe = paddle.static.Executor(place)
+                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+                    # 1) Use bf16_guard to control the range of bf16 kernels used.
+                    with paddle.static.amp.bf16.bf16_guard():
+                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                        hidden = paddle.static.nn.fc(pool, size=10)
+                        loss = paddle.mean(hidden)
+                    # 2) Create the optimizer and set `multi_precision` to True.
+                    # Setting `multi_precision` to True helps to avoid poor accuracy
+                    # or slow convergence.
+                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
+                    # 3) The ops named in `custom_fp32_list` keep the float32 computation type.
+                    amp_list = paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
+                        custom_fp32_list={'pool2d'})
+                    # 4) The entry of Paddle AMP.
+                    # Enable pure bf16 training by setting `use_pure_bf16` to True.
+                    optimizer = paddle.static.amp.bf16.decorate_bf16(
+                        optimizer,
+                        amp_list,
+                        use_pure_bf16=True)
+                    # If you don't use the default_startup_program(), you should pass
+                    # your defined `startup_program` into `minimize`.
+                    optimizer.minimize(loss)
+                    exe.run(paddle.static.default_startup_program())
+                    # 5) Use `amp_init` after the FP32 parameters have been initialized
+                    # (e.g. by `exe.run(startup_program)`).
+                    # To run the testing process as well, pass `test_program` into `amp_init`.
+                    optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+        """
+        assert self._train_program is not None, \
+            "Please call the minimize method first."
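+        # Pure bf16: convert the fp32 parameters created by the startup
+        # program to bf16, restricted to the variable names recorded by
+        # cast_model_to_bf16 during backward().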
+        if self._use_pure_bf16:
+            cast_parameters_to_bf16(place, self._train_program, scope,
+                                    self._to_bf16_var_names)
+        if test_program is not None:
+            if self._use_pure_bf16:
+                cast_model_to_bf16(test_program, self._amp_lists,
+                                   self._use_bf16_guard)
+            elif use_bf16_test:
+                rewrite_program_bf16(test_program, self._amp_lists)
+
+    def apply_gradients(self, params_grads):
+        """
+        Apply gradients.
+
+        Args:
+            params_grads (list): A list of (param, grad) pairs to update.
+
+        Returns:
+            A list of optimize operators.
+        """
+
+        return self._optimizer.apply_gradients(params_grads)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        program = loss.block.program
+        with program_guard(program, startup_program):
+            optimize_ops = self.apply_gradients(params_grads)
+        return optimize_ops
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        """
+        Perform optimization by minimizing the given loss.
+
+        Args:
+            loss (Variable): The loss Variable.
+            startup_program (Program): The startup Program for initializing
+                                       parameters in `parameter_list`.
+            parameter_list (list): A list of Variables to update.
+            no_grad_set (set|None): A set of Variables that should be ignored.
+
+        Returns:
+            The list of optimize ops and a list of (param, grad) pairs.
+        """
+        opt_dict = self._optimizer.__class__.__dict__
+        if 'minimize' in opt_dict and isinstance(opt_dict['minimize'],
+                                                 types.FunctionType):
+            warnings.warn(
+                "The decorated optimizer has its own `minimize` method, but it will not be executed."
+            )
+
+        params_grads = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
+
+        optimize_ops = self.apply_optimize(loss, startup_program, params_grads)
+
+        return optimize_ops, params_grads
+
+
+def decorate_bf16(optimizer,
+                  amp_lists=None,
+                  use_pure_bf16=False,
+                  use_bf16_guard=None):
+    """
+    Decorate the given optimizer to adapt to mixed-precision training.
+
+    Args:
+        optimizer(Optimizer): A common Optimizer.
+        amp_lists (AutoMixedPrecisionListsBF16): An AutoMixedPrecisionListsBF16 object.
+        use_pure_bf16(bool): Whether to use pure bf16 training. Default False.
+        use_bf16_guard(bool): Whether to use `bf16_guard` when constructing the program.
+                              Default None, which means its value equals `use_pure_bf16`.
+
+    Returns:
+        An optimizer acting like a normal one but with mixed-precision training
+        enabled.
+
+    Examples 1:
+        .. code-block:: python
+
+            # fp32&bf16 list based strategy example
+            import paddle
+            import paddle.static as static
+
+            paddle.enable_static()
+
+            data = static.data(name='X', shape=[None, 1], dtype='float32')
+            hidden = static.nn.fc(x=data, size=10)
+            loss = paddle.mean(hidden)
+            optimizer = paddle.optimizer.Adam(learning_rate=0.001)
+
+            mp_optimizer = static.amp.bf16.decorate_bf16(optimizer=optimizer)
+
+            ops, param_grads = mp_optimizer.minimize(loss)
+
+    Examples 2:
+        .. code-block:: python
+
+            # pure bf16 training example
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+
+            def run_example_code():
+                place = paddle.CPUPlace()
+                exe = paddle.static.Executor(place)
+                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+                # 1) Use bf16_guard to control the range of bf16 kernels used.
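+                #    With use_pure_bf16=True, use_bf16_guard defaults to True,
+                #    so ops built outside this guard are kept in fp32.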
+                with paddle.static.amp.bf16.bf16_guard():
+                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                    hidden = paddle.static.nn.fc(pool, size=10)
+                    loss = paddle.mean(hidden)
+                # 2) Create the optimizer and set `multi_precision` to True.
+                # Setting `multi_precision` to True helps to avoid poor accuracy
+                # or slow convergence.
+                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
+                # 3) The ops named in `custom_fp32_list` keep the float32 computation type.
+                amp_list = paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
+                    custom_fp32_list={'pool2d'})
+                # 4) The entry of Paddle AMP.
+                # Enable pure bf16 training by setting `use_pure_bf16` to True.
+                optimizer = paddle.static.amp.bf16.decorate_bf16(
+                    optimizer,
+                    amp_list,
+                    use_pure_bf16=True)
+                # If you don't use the default_startup_program(), you should pass
+                # your defined `startup_program` into `minimize`.
+                optimizer.minimize(loss)
+                exe.run(paddle.static.default_startup_program())
+                # 5) Use `amp_init` after the FP32 parameters have been initialized
+                # (e.g. by `exe.run(startup_program)`).
+                # To run the testing process as well, pass `test_program` into `amp_init`.
+                optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+    """
+    if amp_lists is None:
+        amp_lists = AutoMixedPrecisionListsBF16()
+
+    if use_bf16_guard is None:
+        use_bf16_guard = use_pure_bf16
+
+    mp_optimizer = OptimizerWithMixedPrecision(optimizer, amp_lists,
+                                               use_pure_bf16, use_bf16_guard)
+
+    return mp_optimizer
diff --git a/python/paddle/fluid/contrib/tests/test_bf16_utils.py b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
index faf2307f814..2969b7ea11d 100644
--- a/python/paddle/fluid/contrib/tests/test_bf16_utils.py
+++ b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
@@ -14,7 +14,7 @@ import copy
 import unittest
 import paddle.fluid as fluid
-import paddle.fluid.contrib.mixed_precision as amp
+import paddle.static.amp as amp
 from paddle.fluid import core
 import paddle
@@ -34,34 +34,34 @@ class AMPTest(unittest.TestCase):
         self.assertEqual(self.amp_lists_.gray_list, self.gray_list)

     def test_amp_lists(self):
-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16()
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16()

     def test_amp_lists_1(self):
         # 1. w={'exp'}, b=None
         self.bf16_list.add('exp')
         self.fp32_list.remove('exp')
-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'exp'})
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'exp'})

     def test_amp_lists_2(self):
         # 2. w={'tanh'}, b=None
         self.fp32_list.remove('tanh')
         self.bf16_list.add('tanh')
-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'tanh'})
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'tanh'})

     def test_amp_lists_3(self):
         # 3. w={'lstm'}, b=None
         self.bf16_list.add('lstm')
-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'lstm'})
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'lstm'})

     def test_amp_lists_4(self):
         # 4. w=None, b={'elementwise_add'}
         self.bf16_list.remove('elementwise_add')
         self.fp32_list.add('elementwise_add')
-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_list={'elementwise_add'})

     def test_amp_lists_5(self):
@@ -69,28 +69,28 @@ class AMPTest(unittest.TestCase):
         self.fp32_list.add('elementwise_add')
         self.bf16_list.remove('elementwise_add')

-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_list={'elementwise_add'})

     def test_amp_lists_6(self):
         # 6. w=None, b={'lstm'}
         self.fp32_list.add('lstm')

-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_list={'lstm'})

     def test_amp_lists_7(self):
         self.fp32_list.add('reshape2')
         self.gray_list.remove('reshape2')

-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_list={'reshape2'})

     def test_amp_list_8(self):
         self.bf16_list.add('reshape2')
         self.gray_list.remove('reshape2')

-        self.amp_lists_ = amp.AutoMixedPrecisionListsBF16(
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_bf16_list={'reshape2'})


@@ -98,7 +98,7 @@ class AMPTest2(unittest.TestCase):
     def test_amp_lists_(self):
         # 7. w={'lstm'} b={'lstm'}
         # raise ValueError
-        self.assertRaises(ValueError, amp.AutoMixedPrecisionListsBF16,
+        self.assertRaises(ValueError, amp.bf16.AutoMixedPrecisionListsBF16,
                           {'lstm'}, {'lstm'})

     def test_find_op_index(self):
@@ -117,10 +117,10 @@ class AMPTest2(unittest.TestCase):
             type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
         op2 = block.append_op(
             type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
-        amp_lists_1 = amp.AutoMixedPrecisionListsBF16(
+        amp_lists_1 = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_varnames={'X'})
         assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_1)
-        amp_lists_2 = amp.AutoMixedPrecisionListsBF16(
+        amp_lists_2 = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_varnames={'Y'})
         assert amp.bf16.amp_utils._is_in_fp32_varnames(op2, amp_lists_2)
         assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_2)
diff --git a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
index 40ddcf2e66b..af2c42d6b85 100644
--- a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
+++ b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
@@ -65,13 +65,13 @@ class TestModelCastBF16(unittest.TestCase):
                 fetch_list=fetch_list,
                 return_numpy=(not with_lod))

-    def test_graph_rewrite(self):
+    def _graph_common(self, _amp_fun):
         size = 3
         n = np.ones([size, size], dtype='float32') * 3.2
         nn = np.ones([size, size], dtype='float32') * -2.7

-        n_bf16 = amp.convert_float_to_uint16(n)
-        nn_bf16 = amp.convert_float_to_uint16(nn)
+        n_bf16 = amp.bf16.convert_float_to_uint16(n)
+        nn_bf16 = amp.bf16.convert_float_to_uint16(nn)

         with self.static_graph():
             t_bf16 = layers.data(
                 name='t_bf16', shape=[size, size], dtype=np.uint16)
             tt_bf16 = layers.data(
                 name='tt_bf16', shape=[size, size], dtype=np.uint16)
             t = layers.data(name='t', shape=[size, size], dtype='float32')
             tt = layers.data(name='tt', shape=[size, size], dtype='float32')

             ret = layers.elementwise_add(t, tt)
             ret = layers.elementwise_mul(ret, t)
             ret = layers.reshape(ret, [0, 0])

-            with amp.bf16_guard():
+            with amp.bf16.bf16_guard():
                 ret_bf16 = layers.elementwise_add(t_bf16, tt_bf16)
                 ret_bf16 = layers.elementwise_mul(ret_bf16, t_bf16)
                 ret_bf16 = layers.reshape(ret_bf16, [0, 0])

-            with amp.bf16_guard():
+            with amp.bf16.bf16_guard():
                 ret_fp32bf16 = layers.elementwise_add(t, tt)
                 ret_fp32bf16 = layers.elementwise_mul(ret_fp32bf16, t)
ret_fp32bf16 = layers.reshape(ret_fp32bf16, [0, 0]) @@ -103,7 +103,7 @@ class TestModelCastBF16(unittest.TestCase): 'tt_bf16': nn_bf16, }, fetch_list=[ret_bf16, ret, ret_fp32bf16], - amp_fun=lambda prog: amp.rewrite_program_bf16(prog, use_bf16_guard=True)) + amp_fun=lambda prog: amp.bf16.rewrite_program_bf16(prog)) self.assertTrue(np.allclose(static_ret_bf16, static_ret, 1e-2)) self.assertTrue(np.allclose(static_ret_bf16, ret_fp32bf16, 1e-2)) @@ -112,7 +112,7 @@ class TestModelCastBF16(unittest.TestCase): t = layers.data(name='t', shape=[size, size], dtype='float32') tt = layers.data(name='tt', shape=[size, size], dtype='float32') - with amp.bf16_guard(): + with amp.bf16.bf16_guard(): ret = layers.elementwise_add(t, tt) ret = layers.reshape(ret, [0, 0], act='elu') ret = layers.elementwise_mul(ret, t) @@ -122,17 +122,27 @@ class TestModelCastBF16(unittest.TestCase): self.get_static_graph_result( feed={'t': n, 'tt': nn}, fetch_list=[ret], - amp_fun=lambda prog: amp.rewrite_program_bf16( - prog, - amp.AutoMixedPrecisionListsBF16( - custom_fp32_varnames={'elementwise_add_0.tmp_0'}), - use_bf16_guard=True - ) + amp_fun=_amp_fun ) self.assertTrue( static_ret_bf16, np.ones( [size, size], dtype='float32') * -1.1) + def test_graph_rewrite(self): + self._graph_common(lambda prog: amp.bf16.rewrite_program_bf16( + prog, + amp.bf16.AutoMixedPrecisionListsBF16( + custom_fp32_varnames={'elementwise_add_0.tmp_0'}), + )) + + def test_graph_cast(self): + self._graph_common(lambda prog: amp.bf16.cast_model_to_bf16( + prog, + amp.bf16.AutoMixedPrecisionListsBF16( + custom_fp32_list={'elementwise_mul'}), + use_bf16_guard=True + )) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e5663d607aa..751b6251565 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -332,7 +332,8 @@ def fc(input, for i, input_x in enumerate(input): check_type(input_x, 'input[' + str(i) + ']', Variable, 'fc') dtype = helper.input_dtype() - check_dtype(dtype, 'input', ['float16', 'float32', 'float64'], 'fc') + check_dtype(dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], + 'fc') mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a7ec339bf74..7dcce5efcfc 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -582,10 +582,9 @@ def assign(input, output=None): input = numpy.array(input) if isinstance(input, Variable): - check_dtype( - input.dtype, 'input', - ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], - 'assign', '(When the type of input in assign is Variable.)') + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', 'bool' + ], 'assign', '(When the type of input in assign is Variable.)') if output is None: output = helper.create_variable_for_type_inference( dtype=input.dtype) diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index df43d9366ff..1172ae0f0ea 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -16,6 +16,8 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import paddle.static.amp as amp + import contextlib import numpy import unittest @@ -26,19 +28,34 @@ import os 
paddle.enable_static() -def train(use_cuda, save_dirname, is_local, use_bf16): +def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): x = fluid.layers.data(name='x', shape=[13], dtype='float32') - - y_predict = fluid.layers.fc(input=x, size=1, act=None) - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + if use_bf16: + if not pure_bf16: + with amp.bf16.bf16_guard(): + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + else: + y_predict = fluid.layers.fc(input=x, size=1, act=None) + with amp.bf16.bf16_guard(): + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + else: + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + if use_bf16: - paddle.static.amp.rewrite_program_bf16(fluid.default_main_program()) + sgd_optimizer = amp.bf16.decorate_bf16( + sgd_optimizer, + amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(), + use_bf16_guard=False, + use_pure_bf16=pure_bf16) sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 @@ -54,6 +71,10 @@ def train(use_cuda, save_dirname, is_local, use_bf16): def train_loop(main_program): feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe.run(fluid.default_startup_program()) + test_prog = main_program.clone(for_test=True) + if pure_bf16: + sgd_optimizer.amp_init( + exe.place, test_program=test_prog, use_bf16_test=True) PASS_NUM = 100 for pass_id in range(PASS_NUM): @@ -61,9 +82,8 @@ def train(use_cuda, save_dirname, is_local, use_bf16): avg_loss_value, = exe.run(main_program, feed=feeder.feed(data), fetch_list=[avg_cost]) - print(avg_loss_value) - if avg_loss_value[0] < 10.0: - if save_dirname is not None: + if avg_loss_value[0] < 10.0 or pure_bf16: + if save_dirname is not None and not pure_bf16: fluid.io.save_inference_model(save_dirname, ['x'], [y_predict], exe) return @@ -97,7 +117,7 @@ def train(use_cuda, save_dirname, is_local, use_bf16): train_loop(t.get_trainer_program()) -def infer(use_cuda, save_dirname=None): +def infer(use_cuda, save_dirname=None, use_bf16=False): if save_dirname is None: return @@ -135,7 +155,7 @@ def infer(use_cuda, save_dirname=None): print("ground truth: ", test_label) -def main(use_cuda, is_local=True, use_bf16=False): +def main(use_cuda, is_local=True, use_bf16=False, pure_bf16=False): if use_cuda and not fluid.core.is_compiled_with_cuda(): return @@ -145,11 +165,22 @@ def main(use_cuda, is_local=True, use_bf16=False): # Directory for saving the trained model save_dirname = "fit_a_line.inference.model" - train(use_cuda, save_dirname, is_local, use_bf16) - infer(use_cuda, save_dirname) + train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16) + infer(use_cuda, save_dirname, use_bf16) + + +class TestFitALineBase(unittest.TestCase): + @contextlib.contextmanager + def program_scope_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield -class TestFitALine(unittest.TestCase): +class TestFitALine(TestFitALineBase): def test_cpu(self): with self.program_scope_guard(): main(use_cuda=False) @@ -158,20 +189,17 @@ class TestFitALine(unittest.TestCase): 
with self.program_scope_guard(): main(use_cuda=True) - @unittest.skipIf(not fluid.core.supports_bfloat16(), - "place does not support BF16 evaluation") + +@unittest.skipIf(not fluid.core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestFitALineBF16(TestFitALineBase): def test_bf16(self): with self.program_scope_guard(): main(use_cuda=False, use_bf16=True) - @contextlib.contextmanager - def program_scope_guard(self): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield + def test_pure_bf16(self): + with self.program_scope_guard(): + main(use_cuda=False, use_bf16=True, pure_bf16=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py index ad7550fa9dd..f16592a55cf 100644 --- a/python/paddle/fluid/tests/book/test_word2vec_book.py +++ b/python/paddle/fluid/tests/book/test_word2vec_book.py @@ -44,7 +44,8 @@ def train(target, is_parallel, save_dirname, is_local=True, - use_bf16=False): + use_bf16=False, + pure_bf16=False): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -107,7 +108,13 @@ def train(target, sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) if use_bf16: - paddle.static.amp.rewrite_program_bf16(fluid.default_main_program()) + sgd_optimizer = paddle.static.amp.bf16.decorate_bf16( + sgd_optimizer, + amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16( + custom_fp32_list={'softmax', 'concat'}, ), + use_bf16_guard=False, + use_pure_bf16=pure_bf16) + sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( @@ -121,6 +128,8 @@ def train(target, def train_loop(main_program): exe.run(fluid.default_startup_program()) + if pure_bf16: + sgd_optimizer.amp_init(exe.place) for pass_id in range(PASS_NUM): for data in train_reader(): @@ -128,7 +137,7 @@ def train(target, feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: - if save_dirname is not None: + if save_dirname is not None and not pure_bf16: fluid.io.save_inference_model(save_dirname, [ 'firstw', 'secondw', 'thirdw', 'forthw' ], [predict_word], exe) @@ -246,7 +255,7 @@ def infer(target, save_dirname=None): assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b) -def main(target, is_sparse, is_parallel, use_bf16): +def main(target, is_sparse, is_parallel, use_bf16, pure_bf16): if target == "cuda" and not fluid.core.is_compiled_with_cuda(): return if target == "xpu" and not fluid.core.is_compiled_with_xpu(): @@ -265,7 +274,13 @@ def main(target, is_sparse, is_parallel, use_bf16): # so only inference is turned on. 
train("cpu", is_sparse, is_parallel, save_dirname) else: - train(target, is_sparse, is_parallel, save_dirname, use_bf16=use_bf16) + train( + target, + is_sparse, + is_parallel, + save_dirname, + use_bf16=use_bf16, + pure_bf16=pure_bf16) infer(target, save_dirname) @@ -278,10 +293,15 @@ class W2VTest(unittest.TestCase): pass -def inject_test_method(target, is_sparse, is_parallel, use_bf16=False): +def inject_test_method(target, + is_sparse, + is_parallel, + use_bf16=False, + pure_bf16=False): fn_name = "test_{0}_{1}_{2}{3}".format(target, "sparse" if is_sparse else "dense", "parallel" - if is_parallel else "normal", "_bf16" + if is_parallel else "normal", + "_purebf16" if pure_bf16 else "_bf16" if use_bf16 else "") def __impl__(*args, **kwargs): @@ -290,7 +310,7 @@ def inject_test_method(target, is_sparse, is_parallel, use_bf16=False): scope = fluid.core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - main(target, is_sparse, is_parallel, use_bf16) + main(target, is_sparse, is_parallel, use_bf16, pure_bf16) if (not fluid.core.is_compiled_with_cuda() or target == "cuda") and is_sparse: @@ -307,7 +327,8 @@ for target in ("cuda", "cpu", "xpu"): for is_sparse in (False, True): for is_parallel in (False, ): inject_test_method(target, is_sparse, is_parallel) -inject_test_method("cpu", False, False, use_bf16=True) +inject_test_method("cpu", False, False, True) +inject_test_method("cpu", False, False, True, True) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index 69298f0f6a5..7caae211b7b 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -64,7 +64,7 @@ class SimpleNetWithCond(object): return grads - def build_net(self, cond_i): + def build_net(self, cond_i, use_bf16=False): """ pseudo code: sum_xy = x + y @@ -122,13 +122,22 @@ class SimpleNetWithCond(object): sum_cond = fluid.layers.cond(cond_i > 1.0, cond_true, cond_false) sum_all = fluid.layers.sum([sum_xy, sub_yz, sum_cond]) mean_out = fluid.layers.mean(sum_all) + if use_bf16: + import paddle.static.amp as amp + self.optimizer = amp.bf16.decorate_bf16( + self.optimizer, + amp_lists=amp.bf16.AutoMixedPrecisionListsBF16( + custom_fp32_list={'elementwise_add'}), + use_bf16_guard=False, + use_pure_bf16=True) + self.optimizer.minimize(mean_out) fetch_list = ["param_x", "param_z"] if self.y_no_grad else [ "param_x", "param_y", "param_z" ] fetch_list += [_append_grad_suffix_(param) for param in fetch_list] - return fetch_list + return fetch_list, self.optimizer class TestOptimizer(unittest.TestCase): @@ -180,7 +189,7 @@ class TestOptimizer(unittest.TestCase): for key in ['x', 'y', 'z']: self.param_attr[key] = self.attr.copy() - def _check_grads(self): + def _check_grads(self, use_bf16=False): """ main logic code to check the validity of apply_optimize. 
""" @@ -204,10 +213,16 @@ class TestOptimizer(unittest.TestCase): lambda: dict()) test_net = self.NetClass(self.optimizer, param_lr, y_no_grad) - fetch_list = test_net.build_net(cond_i) + fetch_list, decorated_optimizer = test_net.build_net( + cond_i, use_bf16) + if use_bf16: + self.optimizer = decorated_optimizer exe = fluid.Executor(place) exe.run(init_program) + if use_bf16: + self.optimizer.amp_init(exe.place) + # Train 2 steps to check validity for batch_i in range(2): @@ -222,6 +237,15 @@ class TestOptimizer(unittest.TestCase): param_grads[i]) +@unittest.skipIf(not fluid.core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestSGDOptimizer(TestOptimizer): + def test_optimizer_multiblock_except(self): + with self.assertRaisesRegexp(ValueError, + "var param_y not in this block"): + self._check_grads(use_bf16=True) + + class TestAdamOptimizer(TestOptimizer): """ inherit TestOptimizer and shall override two functions as follows: diff --git a/python/paddle/static/amp/__init__.py b/python/paddle/static/amp/__init__.py index 54de11401f3..4c309edfeaf 100644 --- a/python/paddle/static/amp/__init__.py +++ b/python/paddle/static/amp/__init__.py @@ -18,9 +18,6 @@ from ...fluid.contrib.mixed_precision import AutoMixedPrecisionLists # noqa: F4 from ...fluid.contrib.mixed_precision import fp16_guard # noqa: F401 from ...fluid.contrib.mixed_precision import cast_model_to_fp16 # noqa: F401 from ...fluid.contrib.mixed_precision import cast_parameters_to_fp16 # noqa: F401 -from ...fluid.contrib.mixed_precision import AutoMixedPrecisionListsBF16 # noqa: F401 -from ...fluid.contrib.mixed_precision import bf16_guard # noqa: F401 -from ...fluid.contrib.mixed_precision import rewrite_program_bf16 # noqa: F401 -from ...fluid.contrib.mixed_precision import convert_float_to_uint16 # noqa: F401 +from ...fluid.contrib.mixed_precision import bf16 # noqa: F401 __all__ = [] -- GitLab