fp16_utils.py
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core

def cast_fp16_to_fp32(i, o, prog):
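    """Append a cast op to `prog` converting fp16 variable `i` into fp32 output `o`."""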
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP16,
            "out_dtype": fluid.core.VarDesc.VarType.FP32
        }
    )

def cast_fp32_to_fp16(i, o, prog):
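    """Append a cast op to `prog` converting fp32 variable `i` into fp16 output `o`."""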
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP32,
            "out_dtype": fluid.core.VarDesc.VarType.FP16
        }
    )

def copy_to_master_param(p, block):
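    """Create an fp32 "master" copy of parameter `p` in `block`, named `<p.name>.master`."""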
    v = block.vars.get(p.name, None)
    if v is None:
        raise ValueError("no parameter named %s found!" % p.name)
    new_p = fluid.framework.Parameter(
        block=block,
        shape=v.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=v.type,
        lod_level=v.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=v.name + ".master")
    return new_p


def _update_role_var_grad(prog, params_grads):
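    """Clear op_role_var on backward ops, then re-tag each allreduce op's
    op_role_var with the (param_name, grad_name) pairs it reduces.
    """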
    BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
    gradname_to_paramname = dict()
    for p, g in params_grads:
        gradname_to_paramname[g.name] = p.name
    for op in prog.global_block().ops:
        role = op.attr("op_role")
        if role & int(BACKWARD) and op.has_attr("op_role_var"):
            # this op carries the backward role bit, so clear its op_role_var;
            # it is re-set on the allreduce ops below
            op.desc.remove_attr("op_role_var")
    for op in prog.global_block().ops:
        if op.type == "allreduce":
            allreduce_role_var = []
            for input_varname in op.input_arg_names:
                if input_varname in gradname_to_paramname:
                    allreduce_role_var.append(gradname_to_paramname[input_varname])
                    allreduce_role_var.append(input_varname)
            print("updating role var: ", allreduce_role_var)
            op._set_attr("op_role_var", allreduce_role_var)

def create_master_params_grads(params_grads, main_prog, startup_prog, scale_loss, reduce_master_grad=True):
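    """Create fp32 master copies of the training parameters, cast and un-scale
    their gradients, and optionally allreduce the master gradients.
    Returns the (master_param, grad) pairs the optimizer should apply.
    """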
    master_params_grads = []      # (train_param, master_grad) pairs on the local device
    params_grads_to_apply = []    # (master_param, grad) pairs to apply; allreduced if reduce_master_grad is set
    tmp_role = main_prog._current_role
    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
    main_prog._current_role = OpRole.Backward
    for p, g in params_grads:
        # create master parameters
        master_param = copy_to_master_param(p, main_prog.global_block())
        startup_master_param = startup_prog.global_block()._clone_variable(
            master_param)
        startup_p = startup_prog.global_block().var(p.name)
        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
        # cast fp16 gradients to fp32 (and un-scale) before applying gradients;
        # batch_norm gradients are kept in fp32 already, so only un-scale them
        if g.name.startswith("batch_norm"):
            if scale_loss > 1:
                scaled_g = g / float(scale_loss)
            else:
                scaled_g = g
            master_params_grads.append([p, scaled_g])
            continue

        master_grad = fluid.layers.cast(g, "float32")
        if scale_loss > 1:
            master_grad = master_grad / float(scale_loss)
        master_params_grads.append([p, master_grad])
        if reduce_master_grad:
            reduced_master_grad = fluid.layers.collective._allreduce(master_grad)
        else:
            reduced_master_grad = master_grad
        params_grads_to_apply.append([master_param, reduced_master_grad])
    
    # update the program's op_role_var according to master grads before allreduce.
    _update_role_var_grad(main_prog, master_params_grads)
    main_prog._current_role = tmp_role
    return params_grads_to_apply

def master_param_to_train_param(master_params_grads, params_grads, main_prog):
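    """Cast the updated fp32 master parameters back into the fp16 training
    parameters; batch_norm parameters are skipped.
    """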
    for idx, m_p_g in enumerate(master_params_grads):
        train_p, _ = params_grads[idx]
        if train_p.name.startswith("batch_norm"):
            continue
        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)