diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 10b72524efd4a8f9174eab4f45e6173dc56f2c27..cdcf4cec7ea96279d04609d5c3e4229b78da775f 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -124,13 +124,13 @@ class CPUDenseMomentumFunctor {
     auto p = framework::EigenVector<T>::Flatten(*param);
     auto v = framework::EigenVector<T>::Flatten(*velocity);
     auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto* lr = learning_rate->data<T>();
+    const float* lr = learning_rate->data<float>();
 
     v_out = v * mu + g;
     if (use_nesterov) {
-      p_out = p - (g + v_out * mu) * lr[0];
+      p_out = p - (g + v_out * mu) * static_cast<T>(lr[0]);
     } else {
-      p_out = p - lr[0] * v_out;
+      p_out = p - static_cast<T>(lr[0]) * v_out;
     }
   }
 };
@@ -147,7 +147,7 @@ class DenseMomentumFunctor {
   const T* p_;
   const T* g_;
   const T* v_;
-  const T* lr_;
+  const float* lr_;
   const T mu_;
   const int64_t num_;
   T* p_out_;
@@ -155,8 +155,8 @@ class DenseMomentumFunctor {
 
  public:
   DenseMomentumFunctor(const T* p, const T* g, const T* v,
-                       const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
+                       const float* learning_rate, const T mu,
+                       const int64_t num, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),
@@ -169,10 +169,10 @@ class DenseMomentumFunctor {
     // put memory access in register
     const T p = p_[i];
     const T g = g_[i];
-    const T lr = lr_[0];
+    const float lr = lr_[0];
     const T v = v_[i];
     T v_out = v * mu_ + g;
-    T p_out = p - (g + v_out * mu_) * lr;
+    T p_out = p - (g + v_out * mu_) * static_cast<T>(lr);
     // write reigster to memory
     v_out_[i] = v_out;
     p_out_[i] = p_out;
@@ -185,7 +185,7 @@ class DenseMomentumFunctor {
   const T* p_;
   const T* g_;
   const T* v_;
-  const T* lr_;
+  const float* lr_;
   const T mu_;
   const int64_t num_;
   T* p_out_;
@@ -193,8 +193,8 @@ class DenseMomentumFunctor {
 
  public:
   DenseMomentumFunctor(const T* p, const T* g, const T* v,
-                       const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
+                       const float* learning_rate, const T mu,
+                       const int64_t num, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),
@@ -207,7 +207,7 @@ class DenseMomentumFunctor {
     // put memory access in register
     const T p = p_[i];
     const T g = g_[i];
-    const T lr = lr_[0];
+    const T lr = static_cast<T>(lr_[0]);
     const T v = v_[i];
     T v_out = v * mu_ + g;
     T p_out = p - lr * v_out;
@@ -226,7 +226,7 @@ class SparseMomentumFunctor {
   const T* p_;
   const T* g_;
   const T* v_;
-  const T* lr_;
+  const float* lr_;
   const T mu_;
   const int64_t* rows_;
   const int64_t row_numel_;
@@ -235,7 +235,7 @@ class SparseMomentumFunctor {
   T* v_out_;
 
  public:
-  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
+  SparseMomentumFunctor(const T* p, const T* g, const T* v, const float* lr,
                         const T mu, const int64_t* rows, int64_t row_numel,
                         int64_t row_height, T* p_out, T* v_out)
       : p_(p),
@@ -256,10 +256,10 @@ class SparseMomentumFunctor {
                        : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
-    const T lr = lr_[0];
+    const float lr = lr_[0];
     const T v = v_[i];
     T v_out = v * mu_ + g;
-    T p_out = p - (g + v_out * mu_) * lr;
+    T p_out = p - (g + v_out * mu_) * static_cast<T>(lr);
     // write reigster to memory
     v_out_[i] = v_out;
     p_out_[i] = p_out;
@@ -272,7 +272,7 @@ class SparseMomentumFunctor {
   const T* p_;
   const T* g_;
   const T* v_;
-  const T* lr_;
+  const float* lr_;
   const T mu_;
   const int64_t* rows_;
   const int64_t row_numel_;
@@ -281,7 +281,7 @@ class SparseMomentumFunctor {
   T* v_out_;
 
  public:
-  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
+  SparseMomentumFunctor(const T* p, const T* g, const T* v, const float* lr,
                         const T mu, const int64_t* rows, int64_t row_numel,
                         int64_t row_height, T* p_out, T* v_out)
       : p_(p),
@@ -302,7 +302,7 @@ class SparseMomentumFunctor {
                        : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
-    const T lr = lr_[0];
+    const T lr = static_cast<T>(lr_[0]);
     const T v = v_[i];
     T v_out = v * mu_ + g;
     T p_out = p - v_out * lr;
@@ -342,7 +342,7 @@ class MomentumOpKernel : public framework::OpKernel {
       if (use_nesterov) {
         DenseMomentumFunctor<T, UseNesterov> functor(
             param->data<T>(), grad->data<T>(), velocity->data<T>(),
-            learning_rate->data<T>(), mu, param->numel(),
+            learning_rate->data<float>(), mu, param->numel(),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
@@ -350,7 +350,7 @@ class MomentumOpKernel : public framework::OpKernel {
       } else {
         DenseMomentumFunctor<T, NoNesterov> functor(
             param->data<T>(), grad->data<T>(), velocity->data<T>(),
-            learning_rate->data<T>(), mu, param->numel(),
+            learning_rate->data<float>(), mu, param->numel(),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
@@ -382,8 +382,8 @@ class MomentumOpKernel : public framework::OpKernel {
       if (use_nesterov) {
         SparseMomentumFunctor<T, UseNesterov> functor(
             param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
-            static_cast<int64_t>(merged_grad->rows().size()),
+            velocity->data<T>(), learning_rate->data<float>(), mu, rows,
+            row_numel, static_cast<int64_t>(merged_grad->rows().size()),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
@@ -391,8 +391,8 @@ class MomentumOpKernel : public framework::OpKernel {
       } else {
         SparseMomentumFunctor<T, NoNesterov> functor(
             param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
-            static_cast<int64_t>(merged_grad->rows().size()),
+            velocity->data<T>(), learning_rate->data<float>(), mu, rows,
+            row_numel, static_cast<int64_t>(merged_grad->rows().size()),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 0b142ff33de55f36410eb9c23cb75210fc9d6321..a345949270b3b94078249332ef3cf5ebcd63b896 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -16,11 +16,17 @@ from __future__ import print_function
 
 from ... import core
 from ... import layers
+from ... import global_scope
+from ...log_helper import get_logger
+import logging
+import numpy as np
 
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 def _rename_arg(op, old_name, new_name):
     """
-    If an op has old_name input and output, rename these input
+    If an op has old_name input and output, rename these input
     args new_name.
 
     Args:
@@ -187,6 +193,124 @@ def _is_in_black_varnames(op, amp_lists):
     return False
 
 
+def cast_net_to_fp16(program):
+    valid_types = [
+        core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
+        core.VarDesc.VarType.LOD_TENSOR_ARRAY
+    ]
+    global_block = program.global_block()
+
+    for block in program.blocks:
+        ops = block.ops
+        for op in ops:
+            for in_name in op.input_names:
+                if op.type == 'batch_norm' and in_name != 'X':
+                    continue
+                for in_var_name in op.input(in_name):
+                    in_var = None
+                    try:
+                        in_var = block.var(in_var_name)
+                    except ValueError as e:
+                        _logger.debug(
+                            "-- {}, try to get it in the global block. --".
+                            format(e))
+                        in_var = global_block.var(in_var_name)
+                        if in_var is not None:
+                            _logger.debug(
+                                "-- var {} is got in the global block. --".
+                                format(in_var_name))
+
+                    if in_var is None or in_var.type not in valid_types:
+                        continue
+
+                    if in_var.dtype == core.VarDesc.VarType.FP32:
+                        in_var.desc.set_dtype(core.VarDesc.VarType.FP16)
+
+                    _logger.debug(
+                        "-- op type: {}, in var name: {}, in var dtype: {} --".
+                        format(op.type, in_var_name, in_var.dtype))
+
+            for out_name in op.output_names:
+                if op.type == 'batch_norm' and out_name != 'Y':
+                    continue
+                for out_var_name in op.output(out_name):
+                    out_var = None
+                    try:
+                        out_var = block.var(out_var_name)
+                    except ValueError as e:
+                        _logger.debug(
+                            "-- {}, try to get it in the global block. --".
+                            format(e))
+                        out_var = global_block.var(out_var_name)
+                        if out_var is not None:
+                            _logger.debug(
+                                "-- var {} is got in the global block. --".
+                                format(out_var_name))
+
+                    if out_var is None or out_var.type not in valid_types:
+                        continue
+
+                    if out_var.dtype == core.VarDesc.VarType.FP32:
+                        out_var.desc.set_dtype(core.VarDesc.VarType.FP16)
+
+                    _logger.debug(
+                        "-- op type: {}, out var name: {}, out var dtype: {} --".
+                        format(op.type, out_var_name, out_var.dtype))
+            if op.has_attr('in_dtype') and op.attr(
+                    'in_dtype') == core.VarDesc.VarType.FP32:
+                op._set_attr('in_dtype', core.VarDesc.VarType.FP16)
+            if op.has_attr('out_dtype') and op.attr(
+                    'out_dtype') == core.VarDesc.VarType.FP32:
+                op._set_attr('out_dtype', core.VarDesc.VarType.FP16)
+            if op.has_attr('dtype') and op.attr(
+                    'dtype') == core.VarDesc.VarType.FP32:
+                op._set_attr('dtype', core.VarDesc.VarType.FP16)
+
+
+def cast_parameters_to_fp16(exe, program):
+    global_block = program.global_block()
+    all_parameters = global_block.all_parameters()
+    for param in all_parameters:
+        if not (param.name.find('bn') != -1 and
+                (param.name.endswith('_offset') or param.name.endswith('_mean')
+                 or param.name.endswith('_scale') or
+                 param.name.endswith('_variance'))):
+            param_t = global_scope().find_var(param.name).get_tensor()
+            data = np.array(param_t)
+            param_t.set(np.float16(data), exe.place)
+
+
+# def cast_parameters_to_fp16(program):
+#     global_block = program.global_block()
+#     all_parameters = global_block.all_parameters()
+#     is_bn_params = lambda param: (param.name.find('bn') != -1 and (param.name.endswith('_offset') or param.name.endswith('_mean') or param.name.endswith('_scale') or param.name.endswith('_variance')))
+#     all_param_names = {p.name for p in all_parameters if not is_bn_params(p)}
+#     ops = global_block.ops
+
+#     for param in all_parameters:
+#         if param.name in all_param_names:
+#             param_var = global_block.var(param.name)
+#             if param_var.dtype == core.VarDesc.VarType.FP32:
+#                 param_var.desc.set_dtype(core.VarDesc.VarType.FP16)
+
+#     for op in ops:
+#         target_op = False
+#         for out_name in op.output_names:
+#             for out_var_name in op.output(out_name):
+#                 if out_var_name in all_param_names:
+#                     target_op = True
+#         if target_op:
+#             if op.has_attr('in_dtype') and op.attr(
+#                     'in_dtype') == core.VarDesc.VarType.FP32:
+#                 op._set_attr('in_dtype', core.VarDesc.VarType.FP16)
+#             if op.has_attr('out_dtype') and op.attr(
+#                     'out_dtype') == core.VarDesc.VarType.FP32:
+#                 op._set_attr('out_dtype', core.VarDesc.VarType.FP16)
+#             if op.has_attr('dtype') and op.attr(
+#                     'dtype') == core.VarDesc.VarType.FP32:
+#                 op._set_attr('dtype', core.VarDesc.VarType.FP16)
+
+
 def rewrite_program(main_prog, amp_lists):
     """
     Traverse all ops in current block and insert cast op according to
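Note for reviewers: the parameter filter in `cast_parameters_to_fp16` is purely name-based. A standalone restatement of that predicate (the helper name `is_bn_param` and the sample strings are illustrative, not part of the patch) makes the intent explicit:

```python
# Standalone restatement of the name filter used by cast_parameters_to_fp16.
# `is_bn_param` and the sample parameter names below are illustrative only.
def is_bn_param(name):
    # batch_norm offsets/scales and running mean/variance are kept in FP32
    return 'bn' in name and name.endswith(
        ('_offset', '_mean', '_scale', '_variance'))


print(is_bn_param('bn2a_branch1_mean'))   # True  -> left as FP32
print(is_bn_param('conv1_weights'))       # False -> cast to FP16
```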
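A minimal usage sketch of the two helpers added to `fp16_utils.py`. The model builder, feed/fetch names, and shapes are hypothetical and only illustrate the intended call order (rewrite the program's dtypes first, then convert the FP32 parameter tensors already initialized in the global scope). The `momentum_op.h` half of this patch is what lets the kernels consume an FP32 `LearningRate` tensor while parameters and gradients are FP16; a training loop is not shown here.

```python
# Hypothetical usage sketch (not part of the patch): build an FP32 network,
# then cast the program and the initialized parameters to FP16.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.contrib.mixed_precision.fp16_utils import (
    cast_net_to_fp16, cast_parameters_to_fp16)

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
    conv = fluid.layers.conv2d(image, num_filters=8, filter_size=3)
    bn = fluid.layers.batch_norm(conv)       # bn params/statistics stay FP32
    pred = fluid.layers.fc(bn, size=10)

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)                        # parameters are created in FP32

cast_net_to_fp16(main_prog)                  # rewrite var/op dtypes in the program
cast_parameters_to_fp16(exe, main_prog)      # convert tensors already in the scope

feed = {'image': np.random.rand(1, 3, 224, 224).astype('float16')}
out = exe.run(main_prog, feed=feed, fetch_list=[pred.name])
```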