diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index e07f6ce8ab70a4bbdc638b45e7caa8490c61eeca..5a4b94a8d41b6b5953a50919037d033430e908c5 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -18,7 +18,7 @@ from ... import layers
 from ... import unique_name
 from . import fp16_utils
 from .fp16_utils import create_master_params_grads, master_param_to_train_param
-from .fp16_utils import update_loss_scaling
+from .fp16_utils import update_loss_scaling, rewrite_program
 
 __all__ = ["decorate"]
 
@@ -120,6 +120,7 @@ class OptimizerWithMixedPrecison(object):
             A list of (param, grad), which is a tuple of a parameter and its
             gradient respectively, and the scaled loss.
         """
+        rewrite_program(self._train_program)
         scaled_loss = loss * self._loss_scaling
         self._param_grads = self._optimizer.backward(
             scaled_loss, startup_program, parameter_list, no_grad_set,
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
new file mode 100644
index 0000000000000000000000000000000000000000..59bc0dc4be6c0f191fb8d3cbf519bd298e31d687
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The three sets listed below are changed dynamically. They don't contain all
+# paddle ops currently.
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# safe and performance-critical. These ops are always converted to fp16.
+white_list = {
+    'conv2d',
+    'matmul',
+    'mul',
+}
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# dangerous and whose effects may also be observed in downstream ops.
+black_list = {
+    'exp',
+    'square',
+    'log',
+    'mean',
+    'sum',
+    'cos_sim',
+    'softmax',
+    'softmax_with_cross_entropy',
+    'sigmoid_cross_entropy_with_logits',
+    'cross_entropy',
+    'cross_entropy2',
+}
+
+# This set contains two types of ops, all of which support fp16 calculation.
+# One type is considered numerically-safe, but may be made unsafe by an
+# upstream blacklist op. The other type does not have numerically-significant
+# effects, like stack, flatten2.
+gray_list = {
+    'elementwise_add',
+    'elementwise_sub',
+    'elementwise_mul',
+    'elementwise_div',
+    'elementwise_max',
+    'elementwise_min',
+    'elementwise_pow',
+    'elementwise_mod',
+    'elementwise_floordiv',
+    'tanh',
+    'sigmoid',
+    'lookup_table',
+    'top_k',
+    'pool2d',
+    'pool3d',
+    'dropout',
+    'relu',
+    'relu6',
+    'leaky_relu',
+    'soft_relu',
+    'flatten2',
+    'stack',
+    'unstack',
+    'uniform_random_batch_size_like',
+    'gaussian_random',
+    'gaussian_random_batch_size_like',
+    'slice',
+    'rank',
+    'scale',
+    'transpose2',
+    'reshape2',
+    'gather',
+    'fill_constant',
+    'get_tensor_from_selected_rows',
+    'sign',
+    'cast',
+}
+'''
+# The set of ops that don't support fp16 calculation
+unsupported_fp16_list = {
+    # from python/paddle/fluid/layers/io.py
+    'send',
+    'send_barrier',
+    'recv',
+    'fetch_barrier',
+    'create_recordio_file_reader',
+    'create_random_data_generator',
+    'create_py_reader',
+    'create_shuffle_reader',
+    'create_batch_reader',
+    'create_double_buffer_reader',
+    'create_multi_pass_reader',
+    'read',
+    'load',
+
+    # from python/paddle/fluid/control_flow.py
+    'increment',
+    'less_than',
+    'less_equal',
+    'greater_than',
+    'greater_equal',
+    'equal',
+    'not_equal',
+    'read_from_array',
+    'shrink_rnn_memory',
+    'lod_array_length',
+    'logical_and',
+    'logical_or',
+    'logical_xor',
+    'logical_not',
+    'print',
+    'conditional_block',
+    'while',
+    'ifelse',
+    'is_empty',
+
+    'lstm',
+    'cudnn_lstm',
+    'lstmp',
+    'gru',
+    'gru_unit',
+    'linear_chain_crf',
+    'crf_decoding',
+    'bpr_loss',
+    'chunk_eval',
+    'sequence_conv',
+    'sequence_softmax',
+    # Depthwise conv2d isn't fast and safe currently.
+    # ref: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h#L79
+    'depthwise_conv2d',
+    # Tensor Core kernels are not available for 3D convolutions currently.
+    'conv3d',
+    'sequence_pool',
+    'sequence_concat',
+    'sequence_slice',
+    'data_norm',
+    'layer_norm',
+    'group_norm',
+    'spectral_norm',
+    'depthwise_conv2d_transpose',
+    'sequence_expand',
+    'conv_transposed2d',
+    'conv_transposed3d',
+    'sequence_expand_as',
+    'sequence_pad',
+    'sequence_unpad',
+    'sequence_erase',
+    'beam_search',
+    'beam_search_decode',
+    'lstm_unit',
+    'reduce_sum',
+    'reduce_mean',
+    'reduce_max',
+    'reduce_min',
+    'reduce_prod',
+    'reduce_all',
+    'reduce_any',
+    'split',
+    'edit_distance',
+    'ctc_align',
+    'warpctc',
+    'sequence_reshape',
+    'nce',
+    'hierarchical_sigmoid',
+    'im2sequence',
+    'row_conv',
+    'multiplex',
+    'sample_logits',
+    'one_hot',
+    'smooth_l1_loss',
+    'squeeze2',
+    'unsqueeze2',
+    'lod_reset',
+    'lrn',
+    'pad',
+    'pad_constant_like',
+    'label_smooth',
+    'scatter',
+    'sequence_scatter',
+    'random_crop',
+    'mean_iou',
+    'selu',
+    'crop',
+    'affine_grid',
+    'rank_loss',
+    'margin_rank_loss',
+    'pad2d',
+    'elu',
+    'pow',
+    'stanh',
+    'hard_sigmoid',
+    'swish',
+    'prelu',
+    'brelu',
+    'sequence_enumerate',
+    'sequence_mask',
+    'expand',
+    'sampling_id',
+    'maxout',
+    'space_to_depth',
+    'sequence_reverse',
+    'similarity_focus',
+    'hash',
+    'grid_sampler',
+    'log_loss',
+    'teacher_student_sigmoid_loss',
+    'add_position_encoding',
+    'bilinear_tensor_product',
+    'shuffle_channel',
+    'temporal_shift',
+    'psroi_pool',
+    'huber_loss',
+    'kldiv_loss',
+    'tree_conv',
+    'pixel_shuffle',
+    'fsp',
+    'cvm',
+
+    'affine_channel',
+    'roi_pool',
+    'roi_align',
+    'anchor_generator',
+    'generate_proposals',
+    'generate_proposal_labels',
+    'generate_mask_labels',
+
+}
+'''
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 3445cdbcbb496918400e5c56104f4edb9ef19a0b..a3ca946cf4c66e275ea314e9d5988a3ddc93a627 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 from ... import core
 from ... import layers
 from ... import framework
+from .fp16_lists import black_list, white_list, gray_list
 
 
 def append_cast_op(i, o, prog):
@@ -121,6 +122,198 @@ def master_param_to_train_param(master_params_grads, params_grads, main_prog):
             append_cast_op(m_p_g[0], train_p, main_prog)
 
 
+def _rename_arg(op, old_name, new_name):
+    """
+    Rename every input and output argument of an op that is called
+    old_name to new_name.
+
+    Args:
+        op (Operator): Current operator.
+        old_name (str): The old name of the args.
+        new_name (str): The new name of the args.
+    """
+    op_desc = op.desc
+    if isinstance(op_desc, tuple):
+        op_desc = op_desc[0]
+    op_desc._rename_input(old_name, new_name)
+    op_desc._rename_output(old_name, new_name)
+
+
+def _dtype_to_str(dtype):
+    """
+    Convert specific variable type to its corresponding string.
+
+    Args:
+        dtype (VarType): Variable type.
+
+    Returns:
+        str: 'fp16' if dtype is FP16, otherwise 'fp32'.
+    """
+    if dtype == core.VarDesc.VarType.FP16:
+        return 'fp16'
+    return 'fp32'
+
+
+def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
+    """
+    Insert cast ops in front of an op and rename its args so that every
+    input of src_dtype is consumed as dest_dtype.
+
+    Args:
+        block (Block): The block in which the operator is.
+        op (Operator): The operator to insert cast op.
+        idx (int): The index of current operator.
+        src_dtype (VarType): The input variable dtype of cast op.
+        dest_dtype (VarType): The output variable dtype of cast op.
+
+    Returns:
+        num_cast_ops (int): The number of cast ops that have been inserted.
+    """
+    num_cast_ops = 0
+    valid_types = [
+        core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
+        core.VarDesc.VarType.LOD_TENSOR_ARRAY
+    ]
+    for in_name in op.input_names:
+        for in_var_name in op.input(in_name):
+            in_var = block.var(in_var_name)
+            if in_var.type not in valid_types:
+                continue
+            if in_var.dtype == src_dtype:
+                out_var = block.create_var(
+                    name=in_var.name + '.cast_' + _dtype_to_str(dest_dtype),
+                    dtype=dest_dtype,
+                    persistable=False,
+                    stop_gradient=False)
+                block._insert_op(
+                    idx,
+                    type="cast",
+                    inputs={"X": in_var},
+                    outputs={"Out": out_var},
+                    attrs={
+                        "in_dtype": in_var.dtype,
+                        "out_dtype": out_var.dtype
+                    })
+                num_cast_ops += 1
+                _rename_arg(op, in_var.name, out_var.name)
+            elif op.has_attr('in_dtype'):
+                op._set_attr('in_dtype', dest_dtype)
+    if src_dtype == core.VarDesc.VarType.FP16:
+        # The op now computes in fp32, so its fp16 outputs become fp32.
+        for out_name in op.output_names:
+            for out_var_name in op.output(out_name):
+                out_var = block.var(out_var_name)
+                if out_var.type not in valid_types:
+                    continue
+                if out_var.dtype == core.VarDesc.VarType.FP16:
+                    out_var.desc.set_dtype(core.VarDesc.VarType.FP32)
+                    if op.has_attr('out_dtype'):
+                        op._set_attr('out_dtype', core.VarDesc.VarType.FP32)
+    return num_cast_ops
+
+
+def find_true_prev_op(ops, var_name):
+    """
+    Find the first op in ops that has var_name among its outputs.
+
+    Args:
+        ops (list): The ops to search.
+        var_name (str): The name of the variable to look for.
+
+    Returns:
+        The first op that outputs var_name, or None if there is none.
+    """
+    for op in ops:
+        for out_name in op.output_names:
+            if var_name in op.output(out_name):
+                return op
+    return None
+
+
+def rewrite_program(main_prog):
+    """
+    Traverse all ops in current block and insert cast op according to
+    which set current op belongs to.
+
+    1. When an op belongs to the black list, add it to black set.
+    2. When an op belongs to the white list, add it to white set.
+    3. When an op belongs to the gray list: if one
+       of its inputs is the output of black set op or black list op,
+       add it to black set. If all of its previous ops are not black
+       op and one of its inputs is the output of white set op or
+       white list op, add it to white set.
+    4. When an op isn't in the lists, add it to black op set.
+    5. Add necessary cast ops to make sure that black set op will be
+       computed in fp32 mode, while white set op will be computed in
+       fp16 mode.
+
+    Args:
+        main_prog (Program): The main program for training.
+    """
+    block = main_prog.global_block()
+    ops = block.ops
+    white_op_set = set()
+    black_op_set = set()
+    for op in ops:
+        if op.type in black_list:
+            black_op_set.add(op)
+        elif op.type in white_list:
+            white_op_set.add(op)
+        elif op.type in gray_list:
+            is_black_op = False
+            is_white_op = False
+            for in_name in op.input_names:
+                # if this op has inputs
+                if not in_name:
+                    continue
+                for in_var_name in op.input(in_name):
+                    in_var = block.var(in_var_name)
+                    # this in_var isn't the output of other op
+                    if in_var.op is None:
+                        continue
+                    if in_var.op is op:
+                        prev_op = find_true_prev_op(ops, in_var_name)
+                    else:
+                        prev_op = in_var.op
+                    # find_true_prev_op returns None when no op in ops
+                    # outputs this var; there is nothing to inherit then.
+                    if prev_op is None:
+                        continue
+                    # if it's one of inputs
+                    if prev_op in black_op_set or \
+                            prev_op.type in black_list:
+                        is_black_op = True
+                    if prev_op in white_op_set or \
+                            prev_op.type in white_list:
+                        is_white_op = True
+            # A gray op with neither a black nor a white predecessor is
+            # left out of both sets and gets no cast ops.
+            if is_black_op:
+                black_op_set.add(op)
+            elif is_white_op:
+                white_op_set.add(op)
+        else:
+            # For numerical safe, we apply fp32 computation on ops that
+            # are not determined which list they should stay.
+            black_op_set.add(op)
+
+    idx = 0
+    while idx < len(ops):
+        op = ops[idx]
+        num_cast_ops = 0
+        if op in black_op_set:
+            num_cast_ops = _insert_cast_op(block, op, idx,
+                                           core.VarDesc.VarType.FP16,
+                                           core.VarDesc.VarType.FP32)
+        elif op in white_op_set:
+            num_cast_ops = _insert_cast_op(block, op, idx,
+                                           core.VarDesc.VarType.FP32,
+                                           core.VarDesc.VarType.FP16)
+
+        # Step over the cast ops inserted at idx as well as op itself.
+        idx += num_cast_ops + 1
+
+
 def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
                         num_bad_steps, incr_every_n_steps,
                         decr_every_n_nan_or_inf, incr_ratio, decr_ratio):