fp16_utils.py 32.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import logging
16

17
import numpy as np
18

19
import paddle
20 21 22 23
from paddle.fluid import core, framework, global_scope
from paddle.fluid.log_helper import get_logger
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager

24 25 26 27 28
from .fp16_lists import (
    AutoMixedPrecisionLists,
    black_list,
    get_low_precision_dtypestr,
)
29

30 31 32
_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
)
33

34
_valid_types = [
35 36 37
    core.VarDesc.VarType.LOD_TENSOR,
    core.VarDesc.VarType.SELECTED_ROWS,
    core.VarDesc.VarType.LOD_TENSOR_ARRAY,
38 39 40 41
]

_fp16_guard_pattern = "__use_fp16__"

42

J
Jie Fang 已提交
43 44
def _rename_arg(op, old_name, new_name):
    """
45
    If an op has old_name input and output, rename these input
J
Jie Fang 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59
    args new_name.

    Args:
        op (Operator): Current operator.
        old_name (str): The old name of input args.
        new_name (str): The new name of input args.
    """
    op_desc = op.desc
    if isinstance(op_desc, tuple):
        op_desc = op_desc[0]
    op_desc._rename_input(old_name, new_name)
    op_desc._rename_output(old_name, new_name)


60 61 62 63 64 65 66 67 68 69 70 71
def _rename_op_input(program, op_var_rename_map, origin_ops, keep_fp32_ops):
    for block in program.blocks:
        ops = block.ops
        block_id = block.idx
        for op in ops:
            if op not in origin_ops or op in keep_fp32_ops:
                continue
            for name in op.input_arg_names:
                if name in op_var_rename_map[block_id]:
                    op._rename_input(name, op_var_rename_map[block_id][name])


J
Jie Fang 已提交
72 73 74 75 76 77 78
def _dtype_to_str(dtype):
    """
    Convert specific variable type to its corresponding string.

    Args:
        dtype (VarType): Variable type.
    """
79 80 81
    if dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]:
        # TODO(Xreki): change the returned str to "bf16" for BF16 data type.
        # Currently too many codes use "cast_fp16" as key.
J
Jie Fang 已提交
82 83 84 85 86
        return 'fp16'
    else:
        return 'fp32'


87 88 89 90 91 92 93 94 95 96 97 98 99 100
_keep_layer_norm_scale_bias_to_fp32_flag = True


def _keep_layer_norm_scale_bias_to_fp32(*args):
    global _keep_layer_norm_scale_bias_to_fp32_flag
    if len(args) == 0:
        return _keep_layer_norm_scale_bias_to_fp32_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _keep_layer_norm_scale_bias_to_fp32_flag
        _keep_layer_norm_scale_bias_to_fp32_flag = args[0]
        return old_value


101 102
def _keep_fp32_input(op, in_name):
    op_type = op.type
103
    if op_type == 'batch_norm':
104 105
        # Scale, Bias, Mean, Variance should be float32.
        return in_name != 'X'
106 107
    if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32():
        return in_name != 'X'
108 109
    if op_type == 'instance_norm':
        return in_name != 'X'
110 111 112 113
    if op_type == 'fused_bn_add_activation':
        return in_name not in {'X', 'Z'}
    if op_type == 'resnet_unit':
        return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'}
114 115
    if op_type in ['fused_attention', 'fused_feedforward']:
        return in_name in {
116 117 118 119 120 121
            'LnScale',
            'LnBias',
            'Ln2Scale',
            'Ln2Bias',
            "Ln1Scale",
            "Ln1Bias",
122
        }
123 124
    if op_type == 'fused_multi_transformer':
        return in_name in {'LnScale', 'LnBias', 'FFNLnScale', 'FFNLnBias'}
125 126 127 128 129
    return False


def _keep_fp32_output(op, out_name):
    op_type = op.type
130 131 132
    if op_type in ['batch_norm', 'fused_bn_add_activation']:
        return out_name != 'Y'
    if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32():
133 134 135
        return out_name != 'Y'
    if op_type == 'resnet_unit':
        return out_name not in {'Y', 'ConvX', 'ConvZ'}
136 137
    if op_type in ['fused_attention', 'fused_feedforward']:
        return out_name in {
138 139 140 141 142 143
            'LnMean',
            'LnVariance',
            'Ln2Mean',
            'Ln2Variance',
            'Ln1Mean',
            'Ln1Variance',
144
        }
145 146 147
    return False


J
Jie Fang 已提交
148 149
def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
    """
150
    Insert cast op and rename op's input.
J
Jie Fang 已提交
151 152 153 154 155 156

    Args:
        block (Program): The block in which the operator is.
        op (Operator): The operator to insert cast op.
        idx (int): The index of current operator.
        src_dtype (VarType): The input variable dtype of cast op.
Z
Zhen Wang 已提交
157
        dest_dtype (VarType): The output variable dtype of cast op.
J
Jie Fang 已提交
158 159 160 161 162

    Returns:
        num_cast_op (int): The number of cast ops that have been inserted.
    """
    num_cast_ops = 0
163

J
Jie Fang 已提交
164
    for in_name in op.input_names:
165
        if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input(
166 167
            op, in_name
        ):
168
            continue
J
Jie Fang 已提交
169
        for in_var_name in op.input(in_name):
H
huangxu96 已提交
170
            in_var = block._find_var_recursive(in_var_name)
171
            if in_var.type not in _valid_types or in_var.dtype == dest_dtype:
J
Jie Fang 已提交
172
                continue
173 174 175 176 177 178 179 180
            # op's input is already casted to dest_dtype before. Set the in_var.name to cast_name.
            cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype)
            casted_var = block._find_var_recursive(cast_name)
            if casted_var and casted_var.dtype == dest_dtype:
                _rename_arg(op, in_var.name, casted_var.name)
                continue

            # insert cast for op's input.
J
Jie Fang 已提交
181
            if in_var.dtype == src_dtype:
182 183
                out_var = block.vars.get(cast_name)
                if out_var is None or out_var.dtype != dest_dtype:
184 185 186 187 188 189
                    op_device = op.attr('op_device')
                    # NOTE(wangxi): optimize for pipeline, reduce one send.
                    # if in_var is stop_gradient and prev_op device is `all`,
                    # set cast_op device to `all`, can reduce send cast_var.
                    # TODO: need remove this after we unified the dynamic
                    # and static pipeline interface.
190 191 192 193
                    if (
                        src_dtype == core.VarDesc.VarType.FP32
                        and in_var.stop_gradient
                    ):
194 195
                        prev_op = None
                        if in_var.op is op:
196 197 198
                            prev_op = find_true_prev_op(
                                block.ops, op, in_var_name
                            )
199 200 201 202 203 204 205
                        elif in_var.op is not None:
                            prev_op = in_var.op

                        prev_op_device = None
                        if prev_op is not None:
                            prev_op_device = prev_op.attr('op_device')

206 207 208 209
                        if (
                            prev_op_device is not None
                            and 'all' in prev_op_device
                        ):
210 211
                            op_device = prev_op_device

212 213 214 215
                    out_var = block.create_var(
                        name=cast_name,
                        dtype=dest_dtype,
                        persistable=False,
216 217 218
                        stop_gradient=in_var.stop_gradient,
                    )

219 220 221 222 223 224 225
                    # Only forward program will be inserted cast op, but some ops
                    # has no op_role attr, so here set it direcly. eg. resnet_unit.
                    op_role = (
                        int(core.op_proto_and_checker_maker.OpRole.Forward)
                        if not op.has_attr('op_role')
                        else op.attr('op_role')
                    )
226 227 228 229 230 231 232 233 234
                    block._insert_op_without_sync(
                        idx,
                        type="cast",
                        inputs={"X": in_var},
                        outputs={"Out": out_var},
                        attrs={
                            "in_dtype": in_var.dtype,
                            "out_dtype": out_var.dtype,
                            "op_device": op_device,
235
                            "op_role": op_role,
236 237
                        },
                    )
238
                    num_cast_ops += 1
J
Jie Fang 已提交
239 240
                _rename_arg(op, in_var.name, out_var.name)

241 242 243
    for attr_name in ['in_dtype', 'out_dtype', 'dtype']:
        if op.has_attr(attr_name) and is_float_dtype(op.attr(attr_name)):
            op._set_attr(attr_name, dest_dtype)
244 245 246 247

    return num_cast_ops


248 249 250 251 252 253 254 255 256 257
def find_true_prev_op(ops, cur_op, var_name):
    """
    Find the true prev op that outputs var_name variable.

    Args:
        ops (list): A list of ops.
        cur_op (Operator): Current operator which has var_name variable.
        var_name (string): Variable name.
    """
    prev_op = []
J
Jie Fang 已提交
258
    for op in ops:
259 260
        if op == cur_op:
            break
J
Jie Fang 已提交
261 262 263
        for out_name in op.output_names:
            for out_var_name in op.output(out_name):
                if out_var_name == var_name:
264 265 266
                    prev_op.append(op)
    if prev_op:
        if not len(prev_op) == 1:
267 268
            raise ValueError(
                "There must be only one previous op "
269
                f"that outputs {var_name} variable"
270
            )
271 272 273
        else:
            return prev_op[0]
    return None
J
Jie Fang 已提交
274 275


276
def find_true_post_op(ops, cur_op, var_name, search_all=False):
M
mapingshuo 已提交
277 278 279 280 281 282 283
    """
    if there are post ops, return them, if there is no post op,
    return None instead.
    Args:
        ops (list): A list of ops.
        cur_op (Operator): Current operator which has var_name variable.
        var_name (string): Variable name.
284
        search_all (bool): The type of operator search. Use if \"cur_op\" is not in the \"ops\" set.
M
mapingshuo 已提交
285 286
    """
    post_op = []
287 288
    if search_all:
        """
289 290 291 292 293
        \"cur_op\" do not have to be in list of \"ops\". E.g. \"cur_op\" can come
        from startup_prog block and \"ops\" list from main_prog block.
        By setting idx to -1, we'll start looking for post-ops from the top of the list.
        If search_all is False, assume that \"cur_op\" is in \"ops\" list,
        so to reduce the time of search we can start iterating from \"cur_op\" idx.
294 295 296 297 298 299
        """
        idx = -1
    else:
        for idx, op in enumerate(ops):
            if op == cur_op:
                break
M
mapingshuo 已提交
300 301 302 303 304 305 306

    for i in range(idx + 1, len(ops)):
        op = ops[i]
        for in_name in op.input_names:
            for in_var_name in op.input(in_name):
                if in_var_name == var_name:
                    post_op.append(op)
307 308

    return post_op
M
mapingshuo 已提交
309 310 311


def find_op_index(block_desc, cur_op_desc):
312
    """ """
M
mapingshuo 已提交
313 314 315 316 317 318
    for idx in range(block_desc.op_size()):
        if cur_op_desc == block_desc.op(idx):
            return idx
    return -1


319 320 321 322 323 324 325 326 327 328 329 330
def _is_in_black_varnames(op, amp_lists):
    for in_name in op.input_arg_names:
        if in_name in amp_lists.black_varnames:
            return True

    for out_name in op.output_arg_names:
        if out_name in amp_lists.black_varnames:
            return True

    return False


331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
def _need_keep_fp32(op, unsupported_op_list, use_fp16_guard):
    if op.type in unsupported_op_list:
        # the highest priority condition: If ops don't have fp16 computing kernels,
        # they must be executed in fp32 calculation pattern.
        return True

    # process ops about learning rate
    in_out_arg_names = []
    in_out_arg_names.extend(list(op.input_arg_names))
    in_out_arg_names.extend(list(op.output_arg_names))
    for name in in_out_arg_names:
        if "learning_rate" in name:
            return True

    if use_fp16_guard:
346 347 348
        if op.has_attr("op_namescope") and (
            _fp16_guard_pattern in op.attr("op_namescope")
        ):
349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
            # op in fp16 guard
            return False
        else:
            # op not in fp16 guard
            return True
    else:
        return False


@signature_safe_contextmanager
def fp16_guard():
    """
    As for the pure fp16 training, if users set `use_fp16_guard` to True,
    only those ops created in the context manager `fp16_guard` will be
    transformed as float16 type.
H
huangxu96 已提交
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle
            import paddle.nn.functional as F
            paddle.enable_static()
            data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
            conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)

            with paddle.static.amp.fp16_guard():
                bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                hidden = paddle.static.nn.fc(pool, size=10)
                loss = paddle.mean(hidden)
380 381 382 383 384
    """
    with framework.name_scope(prefix=_fp16_guard_pattern):
        yield


385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
def is_float_dtype(dtype):
    return (
        dtype == core.VarDesc.VarType.FP32
        or dtype == core.VarDesc.VarType.FP16
        or dtype == core.VarDesc.VarType.BF16
        or dtype == core.VarDesc.VarType.FP64
    )


def set_var_dst_dtype(
    op, var_names, block, global_block, dtype, need_set_dtype
):
    low_precison_var_names = set()
    for var_name in var_names:
        var = None
        try:
            var = block._var_recursive(var_name)
        except ValueError as e:
            _logger.debug(f"-- {e}, try to get it in the global block --")
            var = global_block.var(var_name)
            if var is not None:
                _logger.debug(
                    f"-- var {var_name} is got in the global block --"
                )

        if var is None or var.type not in _valid_types:
            continue

        if is_float_dtype(var.dtype):
            low_precison_var_names.add(var_name)
            if need_set_dtype:
                var.desc.set_dtype(dtype)

        _logger.debug(
            "---- op type: {}, var name: {}, var dtype: {} ----".format(
                op.type, var_name, var.dtype
            )
        )

    return low_precison_var_names


def set_param_dtype(program, dtype, amp_lists, use_fp16_guard, level):
    keep_fp32_var_names = set()
Z
Zhang Ting 已提交
429 430
    if level == "O1":
        return keep_fp32_var_names
431 432 433 434 435
    all_parameters = []
    for block in program.blocks:
        all_parameters.extend(block.all_parameters())
        ops = block.ops
        for op in ops:
Z
Zhang Ting 已提交
436 437 438 439 440
            # Currently, lookup_table is in black_list and unsupport_list, it's weight will be
            # set to fp32 in setp 1 of cast_model_tp_fp16. But the weight may be used as matmul's
            # input in transformer, so the weight is also in to_fp16_var_names.
            # TODO(zhangting2020): consider fix auto_parallel_fp16 and remove lookup_table
            # from black_list and unsupport_list.
441
            if op.type in amp_lists.black_list:
Z
Zhang Ting 已提交
442 443
                continue
            if _need_keep_fp32(op, amp_lists.unsupported_list, use_fp16_guard):
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
                for in_name in op.input_names:
                    keep_fp32_var_names = keep_fp32_var_names.union(
                        op.input(in_name)
                    )
            else:
                for in_name in op.input_names:
                    if not core.is_compiled_with_ipu() and _keep_fp32_input(
                        op, in_name
                    ):
                        keep_fp32_var_names = keep_fp32_var_names.union(
                            op.input(in_name)
                        )

    for param in all_parameters:
        if param.name not in keep_fp32_var_names:
            _logger.debug(f"-- set param {param.name} to {dtype} --.")
            param.desc.set_dtype(dtype)
Z
Zhang Ting 已提交
461
    return keep_fp32_var_names
462 463


464
def op_need_keep_fp32(op, amp_lists, use_fp16_guard, params_list):
465
    need_keep_fp32 = False
466
    fp16_varname_list_in_fp32_op = set()
467 468 469 470 471 472 473 474 475 476 477 478
    if _need_keep_fp32(
        op,
        amp_lists.unsupported_list,
        use_fp16_guard,
    ):
        need_keep_fp32 = True
    elif amp_lists.black_varnames is not None and _is_in_black_varnames(
        op, amp_lists
    ):
        need_keep_fp32 = True
    elif op.type in amp_lists.black_list:
        need_keep_fp32 = True
479 480 481 482 483 484
        for in_name in op.input_names:
            for params in params_list:
                if op.input(in_name)[0] == params.name:
                    fp16_varname_list_in_fp32_op = (
                        fp16_varname_list_in_fp32_op.union(op.input(in_name))
                    )
485

486
    return need_keep_fp32, fp16_varname_list_in_fp32_op
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590


def get_promote_dtype(op, amp_dtype, block):
    dst_dtype = amp_dtype
    for in_name in op.input_names:
        # for ipu, all inputs must be converted to fp16
        if not core.is_compiled_with_ipu() and _keep_fp32_input(op, in_name):
            _logger.debug(
                "---- Input {} {} should be kept fp32 ----".format(
                    in_name, op.input(in_name)
                )
            )
            continue
        # if this op has inputs
        if in_name:
            for in_var_name in op.input(in_name):
                in_var = block._find_var_recursive(in_var_name)
                if in_var and in_var.dtype == core.VarDesc.VarType.FP32:
                    dst_dtype = core.VarDesc.VarType.FP32
                    break
        else:
            dst_dtype = core.VarDesc.VarType.FP32

    return dst_dtype


def get_amp_dst_dtype(
    op, amp_dtype, level, block, amp_lists, keep_fp32_ops, keep_fp16_ops
):
    if level == 'O2':
        return amp_dtype

    ops = block.ops
    dst_dtype = amp_dtype
    if op.type in amp_lists.gray_list:
        keep_fp32 = False
        keep_fp16 = False
        for in_name in op.input_names:
            # if this op has inputs
            if in_name:
                for in_var_name in op.input(in_name):
                    in_var = block._find_var_recursive(in_var_name)
                    # this in_var isn't the output of other op
                    if in_var.op is None:
                        continue
                    elif in_var.op is op:
                        prev_op = find_true_prev_op(ops, op, in_var_name)
                        if prev_op is None:
                            continue
                    else:
                        prev_op = in_var.op

                    # if it's one of inputs
                    if (
                        prev_op in keep_fp32_ops
                        or prev_op.type in amp_lists.black_list
                    ):
                        dst_dtype = core.VarDesc.VarType.FP32
                    elif (
                        prev_op in keep_fp16_ops
                        or prev_op.type in amp_lists.white_list
                    ):
                        dst_dtype = amp_dtype
    else:
        # For numerical safe, we apply fp32 computation on ops that
        # are not determined which list they should stay.
        dst_dtype = core.VarDesc.VarType.FP32
    return dst_dtype


def process_op_input_and_outputs(op, block, global_block, dtype):
    low_precison_var_names = set()
    # Get the FP16 input because the low_precison_var_names is required for the parameter casting.
    # The dtype of the input is not set to fp16, because it is done in the step 3 of cast_model_to_fp16.
    for in_name in op.input_names:
        # for ipu, all inputs must be converted to fp16
        if not core.is_compiled_with_ipu() and _keep_fp32_input(op, in_name):
            continue
        in_vars = set_var_dst_dtype(
            op,
            op.input(in_name),
            block,
            global_block,
            dtype,
            need_set_dtype=False,
        )
        low_precison_var_names = low_precison_var_names.union(in_vars)
    # Set the output to FP16 because its consumer OP needs to determine if the dtype needs
    # to be promoted.
    for out_name in op.output_names:
        # for ipu, all outputs must be converted to fp16
        if not core.is_compiled_with_ipu() and _keep_fp32_output(op, out_name):
            continue
        set_var_dst_dtype(
            op,
            op.output(out_name),
            block,
            global_block,
            dtype,
            need_set_dtype=True,
        )
    return low_precison_var_names


591 592 593 594 595
def cast_model_to_fp16(
    program,
    amp_lists=None,
    use_fp16_guard=True,
    dest_type=core.VarDesc.VarType.FP16,
596 597
    level='O2',
    use_promote=False,
598
):
599 600 601 602 603 604
    """
    Traverse all ops in the whole model and set their inputs and outputs
    to the fp16 data type. This function will do some special process for
    the batch normalization, which keeps the computational process of
    batchnorms in FP32.
    Args:
605 606 607 608
        program (Program): The used program.
        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
        use_fp16_guard(bool): Determine whether to use `fp16_guard` when
                              constructing the program. Default True.
609
        dest_type(core.VarDesc.VarType): the cast type. such as core.VarDesc.VarType.FP16 and core.VarDesc.VarType.BF16.
610
    """
611 612
    _logger.debug("---- before cast model to fp16 ----")
    _logger.debug(program)
613
    if amp_lists is None:
614 615
        dtype = get_low_precision_dtypestr(dest_type)
        amp_lists = AutoMixedPrecisionLists(dtype)
616 617 618 619 620

    # For amp o2 there is no blacklist by default.
    if level == 'O2':
        amp_lists.black_list = amp_lists.black_list - black_list

621 622
    global_block = program.global_block()
    keep_fp32_ops = set()
623
    keep_fp16_ops = set()
624
    to_fp16_var_names = set()
Z
Zhang Ting 已提交
625
    keep_fp32_var_names = set()
626

627
    # step 1: set params dtype.
Z
Zhang Ting 已提交
628
    fp32_var_names = set_param_dtype(
629 630 631 632 633 634
        program,
        dtype=dest_type,
        amp_lists=amp_lists,
        use_fp16_guard=use_fp16_guard,
        level=level,
    )
Z
Zhang Ting 已提交
635
    keep_fp32_var_names = keep_fp32_var_names.union(fp32_var_names)
636 637 638

    def need_process(op):
        need_process = True
Z
Zhang Ting 已提交
639
        if op.type in ["create_py_reader", "read"]:
640 641 642
            need_process = False
        else:
            for attr_name in ['out_dtype', 'dtype']:
Z
Zhang Ting 已提交
643 644 645 646 647
                # output type of some operators such as fill_constant will be determined by the attribute value.
                #
                if not op.has_attr('in_dtype') and (
                    op.has_attr(attr_name)
                    and is_float_dtype(op.attr(attr_name))
648 649 650 651 652 653
                ):
                    need_process = False

        return need_process

    # step 2: divide op into different sets according to the black/unsupported and white lists.
654
    for block in program.blocks:
655 656
        ops = block.ops
        for op in ops:
657 658 659
            _logger.debug(f"-- process op: {op}  --")
            if not need_process(op):
                _logger.debug("---- The op does not need to be processed ----.")
660
                continue
661 662 663 664 665 666 667 668
            all_params = global_block.all_parameters()
            op_keep_fp32, fp16_var_names_in_fp32_op = op_need_keep_fp32(
                op, amp_lists, use_fp16_guard, all_params
            )
            to_fp16_var_names = to_fp16_var_names.union(
                fp16_var_names_in_fp32_op
            )
            if op_keep_fp32:
669
                keep_fp32_ops.add(op)
670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
                process_op_input_and_outputs(
                    op, block, global_block, core.VarDesc.VarType.FP32
                )
                _logger.debug(
                    "---- Add into keep_fp32_ops because the op needs to be kept fp32 ----"
                )
            elif op.type in amp_lists.white_list:
                keep_fp16_ops.add(op)
                # get fp16 inputs and set op's outputs to fp16 for promote judgments
                fp16_var_names = process_op_input_and_outputs(
                    op, block, global_block, dest_type
                )
                to_fp16_var_names = to_fp16_var_names.union(fp16_var_names)
                _logger.debug(
                    "---- Add into keep_fp16_ops because the op in white_list ----"
                )
            else:
Z
Zhang Ting 已提交
687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
                # if cast in orgin program, we only modifiy attr and output's dtype to avoid dtype mismatch errors.
                if op.type == 'cast':
                    in_var = block._find_var_recursive(op.input('X')[0])
                    out_var = block._find_var_recursive(op.output('Out')[0])
                    op._set_attr('in_dtype', in_var.dtype)
                    out_var.desc.set_dtype(paddle.dtype(op.attr('out_dtype')))
                    _logger.debug(
                        "---- op type: {}, in var [name: {} dtype: {}], out var [name: {} dtype: {}], attr [in_dtype {} out_dtype {}] ----".format(
                            op.type,
                            op.input('X')[0],
                            in_var.dtype,
                            op.output('Out')[0],
                            out_var.dtype,
                            op.attr('in_dtype'),
                            op.attr('out_dtype'),
                        )
                    )
                    continue
705 706 707 708 709 710 711 712 713 714 715 716 717 718
                # divide others ops into fp16/fp32 sets according to promoting principle.
                dst_dtype = dest_type
                if not use_promote:
                    dst_dtype = get_amp_dst_dtype(
                        op,
                        dest_type,
                        level,
                        block,
                        amp_lists,
                        keep_fp32_ops,
                        keep_fp16_ops,
                    )
                else:
                    dst_dtype = get_promote_dtype(op, dest_type, block)
719

720 721 722 723 724 725
                if dst_dtype == dest_type:
                    keep_fp16_ops.add(op)
                    fp16_var_names = process_op_input_and_outputs(
                        op, block, global_block, dest_type
                    )
                    to_fp16_var_names = to_fp16_var_names.union(fp16_var_names)
726
                    _logger.debug(
727 728 729 730 731 732
                        "----  Add into keep_fp16_ops because it should be promoted to fp16 ----"
                    )
                else:
                    keep_fp32_ops.add(op)
                    process_op_input_and_outputs(
                        op, block, global_block, core.VarDesc.VarType.FP32
733
                    )
734
                    _logger.debug(
735
                        "----  Add into keep_fp32_ops because it should be promoted to fp32 ----"
736
                    )
737

738
    # step 3: insert cast op for op's inputs.
739 740 741 742 743 744
    for block in program.blocks:
        ops = block.ops
        idx = 0
        while idx < len(ops):
            op = ops[idx]
            num_cast_ops = 0
745 746 747 748 749 750 751 752 753
            if op in keep_fp16_ops:
                in_var_cast_num = _insert_cast_op(
                    block,
                    op,
                    idx,
                    core.VarDesc.VarType.FP32,
                    dest_type,
                )
                num_cast_ops += in_var_cast_num
754
            if op in keep_fp32_ops:
755
                in_var_cast_num = _insert_cast_op(
756 757 758
                    block,
                    op,
                    idx,
759
                    dest_type,
760 761
                    core.VarDesc.VarType.FP32,
                )
762
                num_cast_ops += in_var_cast_num
763

764 765 766
            idx += num_cast_ops + 1
    _logger.debug("---- after cast model to fp16 ----")
    _logger.debug(program)
Z
Zhang Ting 已提交
767 768

    to_fp16_var_names.difference_update(keep_fp32_var_names)
769
    return to_fp16_var_names
770

771

772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787
def _convert_float_to_bfloat16(place, fp32_array):
    paddle.disable_static()
    framework._set_expected_place(place)
    fp32_tensor = paddle.to_tensor(fp32_array)
    bf16_array = paddle.cast(fp32_tensor, paddle.bfloat16).numpy()
    paddle.enable_static()
    return bf16_array


def cast_parameters_to_fp16(
    place,
    program,
    scope=None,
    to_fp16_var_names=None,
    dest_type=core.VarDesc.VarType.FP16,
):
788
    """
789
    Traverse all parameters in the whole model and set them to the FP16 data type.
790 791
    Whereas, this function will keep parameters of batchnorms in FP32.
    Args:
792 793 794 795 796 797 798
        place(fluid.CPUPlace|fluid.CUDAPlace): `place` is used to restore the FP16 weight tensors.
        program (Program): The used program.
        scope(fluid.Scope, optional): `scope` is used to get the FP32 weight tensor values.
                                      Default is None.
        to_fp16_var_names(set|list, optional): The data types of vars in `to_fp16_var_names`
                                               will be set to FP16. Usually, it is the returned
                                               value of `cast_model_to_fp16` API.
799
        dest_type(core.VarDesc.VarType): the cast type. such as core.VarDesc.VarType.FP16 and core.VarDesc.VarType.BF16.
800
    """
801 802 803 804
    all_parameters = []
    for block in program.blocks:
        all_parameters.extend(block.all_parameters())

805
    dtype_str = get_low_precision_dtypestr(dest_type)
806 807
    fp16_var_names = to_fp16_var_names if to_fp16_var_names else set()
    var_scope = scope if scope else global_scope()
808
    for param in all_parameters:
809
        if param.name in fp16_var_names:
810 811 812
            _logger.debug(
                f"-- cast {param.name} to {dtype_str}, place is {place}"
            )
813 814 815 816 817 818 819 820 821 822
            if var_scope.find_var(param.name):
                param_t = var_scope.find_var(param.name).get_tensor()
                data = np.array(param_t)
                if dest_type == core.VarDesc.VarType.BF16:
                    bf16_data = _convert_float_to_bfloat16(place, data)
                    param_t.set(bf16_data, place)
                else:
                    param_t.set(np.float16(data), place)
            else:
                _logger.warning(f"Cannot find {param.name}")
823 824


825 826 827
def update_role_var_grad(main_prog, params_grads):
    """
    Update op_role_var attr for some ops to make sure the gradients
Z
Zhen Wang 已提交
828
    transferred across GPUs is FP16.
829 830 831 832 833 834 835 836 837 838
    1. Check whether the op that outputs gradient is cast or not.
    2. If op is cast and gradient is FP32, remove the op_role_var
       and find the prev op which outputs FP16 gradient
    3. Update the op_role_var of the prev op.

    Args:
        main_prog (Program): The main program for training.
        params_grads (list): A list of params and grads.
    """
    block = main_prog.global_block()
F
fangshuixun007 已提交
839
    block._sync_with_cpp()
840 841 842 843 844 845 846
    BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
    OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
    for p, g in params_grads:
        op = g.op
        if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast':
            role = op.attr('op_role')
            if role & int(BACKWARD) and op.has_attr('op_role_var'):
F
fangshuixun007 已提交
847
                op._remove_attr("op_role_var")
848
            else:
849
                raise ValueError(
850 851
                    f"The cast op {op} must be in BACKWARD role "
                    "and have op_role_var attr."
852
                )
853 854 855

            fp16_grad_name = op.input(op.input_names[0])[0]
            op_for_fp16_grad = find_true_prev_op(block.ops, op, fp16_grad_name)
856
            op_role_var_attr_name = (
857
                core.op_proto_and_checker_maker.kOpRoleVarAttrName()
858
            )
859 860 861 862 863
            attr_val = [p.name, fp16_grad_name]
            if op_for_fp16_grad.has_attr(op_role_var_attr_name):
                attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
            op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)

Z
Zhen Wang 已提交
864 865
            # Maximize the all_reduce overlap, and perform the cast
            # operation after gradients transfer.
866
            op._set_attr('op_role', OPTIMIZE)
M
mapingshuo 已提交
867 868 869 870
            # optimize op should stay behind forward and backward ops
            if op == block.ops[-1]:
                continue
            post_ops = find_true_post_op(block.ops, op, g.name)
871
            if post_ops:
872
                raise ValueError(
873
                    f"The cast op {op}'s output should not be"
874
                    "used by a non-optimize op, however, it"
875
                    f"is used by {post_ops[0]}"
876
                )
877
            # add new op in the python and cpp at the same time
M
mapingshuo 已提交
878 879
            new_op_desc = block.desc.append_op()
            new_op_desc.copy_from(op.desc)
880 881 882 883 884 885 886 887
            new_op = framework.Operator(
                block=block,
                desc=new_op_desc,
                type=None,
                inputs=None,
                outputs=None,
                attrs=None,
            )
F
fangshuixun007 已提交
888
            block.ops.append(new_op)
M
mapingshuo 已提交
889 890
            op_idx = find_op_index(block.desc, op.desc)
            if op_idx == -1:
891
                raise ValueError(f"The op {op} is not in program")
F
fangshuixun007 已提交
892 893
            block._remove_op(op_idx, sync=False)
    block._sync_with_cpp()