diff --git a/example/image-classification/utility.py b/example/image-classification/utility.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b10a179ac2231cb26ab42993b7300d5e99f44bc
--- /dev/null
+++ b/example/image-classification/utility.py
@@ -0,0 +1,65 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+import six
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(six.iteritems(vars(args))):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add an argparse argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    # argparse's type=bool treats any non-empty string as True, so route
+    # bool flags through distutils.util.strtobool instead.
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
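Usage sketch (not part of the diff): how the two helpers above are meant to be used together. The flag names and defaults are illustrative, and the sketch assumes utility.py is on the import path.

    import argparse
    from utility import add_arguments, print_arguments

    parser = argparse.ArgumentParser(description="Image classification.")
    # bool flags go through strtobool, so "--use_gpu False" parses as False
    add_arguments("batch_size", int, 256, "Minibatch size.", parser)
    add_arguments("use_gpu", bool, True, "Whether to use GPU.", parser)
    args = parser.parse_args()
    print_arguments(args)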
diff --git a/example/image-classification/utils/__init__.py b/example/image-classification/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4751caceeb14f0dddc937d90b4c953a870ffc3f8
--- /dev/null
+++ b/example/image-classification/utils/__init__.py
@@ -0,0 +1,2 @@
+from .learning_rate import cosine_decay, lr_warmup
+from .fp16_utils import create_master_params_grads, master_param_to_train_param
diff --git a/example/image-classification/utils/fp16_utils.py b/example/image-classification/utils/fp16_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..aedf51aed3067a8fa8c010a08cbb18a1d198ac63
--- /dev/null
+++ b/example/image-classification/utils/fp16_utils.py
@@ -0,0 +1,85 @@
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+
+
+def cast_fp16_to_fp32(i, o, prog):
+    prog.global_block().append_op(
+        type="cast",
+        inputs={"X": i},
+        outputs={"Out": o},
+        attrs={
+            "in_dtype": fluid.core.VarDesc.VarType.FP16,
+            "out_dtype": fluid.core.VarDesc.VarType.FP32
+        })
+
+
+def cast_fp32_to_fp16(i, o, prog):
+    prog.global_block().append_op(
+        type="cast",
+        inputs={"X": i},
+        outputs={"Out": o},
+        attrs={
+            "in_dtype": fluid.core.VarDesc.VarType.FP32,
+            "out_dtype": fluid.core.VarDesc.VarType.FP16
+        })
+
+
+def copy_to_master_param(p, block):
+    # create an fp32 copy of parameter p, named "<param>.master"
+    v = block.vars.get(p.name, None)
+    if v is None:
+        raise ValueError("no param name %s found!" % p.name)
+    new_p = fluid.framework.Parameter(
+        block=block,
+        shape=v.shape,
+        dtype=fluid.core.VarDesc.VarType.FP32,
+        type=v.type,
+        lod_level=v.lod_level,
+        stop_gradient=p.stop_gradient,
+        trainable=p.trainable,
+        optimize_attr=p.optimize_attr,
+        regularizer=p.regularizer,
+        gradient_clip_attr=p.gradient_clip_attr,
+        error_clip=p.error_clip,
+        name=v.name + ".master")
+    return new_p
+
+
+def create_master_params_grads(params_grads, main_prog, startup_prog,
+                               scale_loss):
+    master_params_grads = []
+    tmp_role = main_prog._current_role
+    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
+    main_prog._current_role = OpRole.Backward
+    for p, g in params_grads:
+        # create master parameters
+        master_param = copy_to_master_param(p, main_prog.global_block())
+        startup_master_param = startup_prog.global_block()._clone_variable(
+            master_param)
+        startup_p = startup_prog.global_block().var(p.name)
+        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
+        # batch_norm params are kept in fp32, so only un-scale their grads
+        if g.name.startswith("batch_norm"):
+            if scale_loss > 1:
+                scaled_g = g / float(scale_loss)
+            else:
+                scaled_g = g
+            master_params_grads.append([p, scaled_g])
+            continue
+        # cast fp16 gradients to fp32 before applying gradients
+        master_grad = fluid.layers.cast(g, "float32")
+        if scale_loss > 1:
+            master_grad = master_grad / float(scale_loss)
+        master_params_grads.append([master_param, master_grad])
+    main_prog._current_role = tmp_role
+    return master_params_grads
+
+
+def master_param_to_train_param(master_params_grads, params_grads, main_prog):
+    for idx, m_p_g in enumerate(master_params_grads):
+        train_p, _ = params_grads[idx]
+        if train_p.name.startswith("batch_norm"):
+            continue
+        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
+            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
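Usage sketch (not part of the diff): how the fp16 helpers above plug into a fluid training program. The optimizer, the avg_cost loss variable, and the scale_loss value are illustrative assumptions, not taken from this change.

    import paddle.fluid as fluid
    from utils import create_master_params_grads, master_param_to_train_param

    scale_loss = 8.0  # assumed loss-scaling factor
    # avg_cost is the (assumed) fp16 model loss built earlier in the program
    optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
    params_grads = optimizer.backward(avg_cost * scale_loss)
    # keep fp32 master copies; grads are cast to fp32 and un-scaled
    master_params_grads = create_master_params_grads(
        params_grads, fluid.default_main_program(),
        fluid.default_startup_program(), scale_loss)
    optimizer.apply_gradients(master_params_grads)
    # copy updated fp32 master weights back into the fp16 training params
    master_param_to_train_param(master_params_grads, params_grads,
                                fluid.default_main_program())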
diff --git a/example/image-classification/utils/learning_rate.py b/example/image-classification/utils/learning_rate.py
new file mode 100644
index 0000000000000000000000000000000000000000..299fdcf3f12511565403008fbc221d5699f8181c
--- /dev/null
+++ b/example/image-classification/utils/learning_rate.py
@@ -0,0 +1,51 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.initializer import init_on_cpu
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import math
+
+
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """Applies cosine decay to the learning rate.
+    decayed_lr = learning_rate * (cos(epoch * (pi / epochs)) + 1) / 2
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
+    return decayed_lr
+
+
+def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
+    """Applies linear learning rate warmup for distributed training.
+    Argument learning_rate can be a float or a Variable.
+    lr = start_lr + (end_lr - start_lr) * (global_step / warmup_steps)
+    """
+    assert isinstance(end_lr, float)
+    assert isinstance(start_lr, float)
+    linear_step = end_lr - start_lr
+    with fluid.default_main_program()._lr_schedule_guard():
+        lr = fluid.layers.tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate_warmup")
+
+        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
+
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(global_step < warmup_steps):
+                decayed_lr = start_lr + linear_step * (
+                    global_step / warmup_steps)
+                fluid.layers.tensor.assign(decayed_lr, lr)
+            with switch.default():
+                fluid.layers.tensor.assign(learning_rate, lr)
+
+        return lr
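Usage sketch (not part of the diff): combining the two schedules above with an optimizer. The steps-per-epoch count, base learning rate, and warmup length are illustrative assumptions.

    import paddle.fluid as fluid
    from utils import cosine_decay, lr_warmup

    steps_per_epoch = 5000  # assumed: ~1.28M images / batch size 256
    base_lr = 0.1
    decayed = cosine_decay(learning_rate=base_lr,
                           step_each_epoch=steps_per_epoch, epochs=120)
    # warm up linearly from 0 to base_lr over the first 5 epochs, then
    # hand off to the cosine schedule
    lr = lr_warmup(decayed, warmup_steps=5 * steps_per_epoch,
                   start_lr=0.0, end_lr=base_lr)
    optimizer = fluid.optimizer.Momentum(learning_rate=lr, momentum=0.9)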