# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import re

import numpy as np
import paddle.fluid as F
import paddle.fluid.layers as L
import paddle.fluid.dygraph as D

log = logging.getLogger(__name__)


def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """Applies linear warmup of the learning rate from 0 and linear decay to 0."""
    with F.default_main_program()._lr_schedule_guard():
        lr = L.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")

        global_step = L.learning_rate_scheduler._decay_step_counter()

        # Warmup: scale the learning rate linearly by step / warmup_steps.
        warmup_lr = learning_rate * (global_step / warmup_steps)

        # Linear (power=1.0) polynomial decay from learning_rate down to 0.
        poly_decay_lr = L.learning_rate_scheduler.polynomial_decay(
            learning_rate=learning_rate,
            decay_steps=num_train_steps,
            end_learning_rate=0.0,
            power=1.0,
            cycle=False)

        # During warmup warmup_lr < poly_decay_lr, so taking the elementwise
        # minimum yields warmup followed by decay.
        decayed_lr = L.elementwise_min(warmup_lr, poly_decay_lr)
        L.assign(decayed_lr, lr)
        return lr


def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 init_loss_scaling=128,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8):
    """Do backward for static graph: build the optimizer, LR schedule,
    gradient clipping and decoupled weight decay for `train_program`."""

    def exclude_from_weight_decay(param):
        name = param.name
        # Strip the '.master' suffix added to AMP master-weight copies.
        if name.endswith('.master'):
            name = name[:-len('.master')]
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = L.learning_rate_scheduler \
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        log.debug('using Adam')
        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = L.create_global_var(
            name=F.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        log.debug('using Adam')
        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[F.default_main_program()] = scheduled_lr

    if use_fp16:
        log.info('AMP activated')
        optimizer = F.contrib.mixed_precision.decorate(
            optimizer,
            amp_lists=F.contrib.mixed_precision.AutoMixedPrecisionLists(
                custom_black_varnames={"loss"},
                custom_black_list={'layer_norm', 'arg_max', 'argmax'}),
            init_loss_scaling=init_loss_scaling,
            use_dynamic_loss_scaling=True)
        loss_scaling = optimizer.get_loss_scaling()
    else:
        loss_scaling = None

    F.clip.set_gradient_clip(
        clip=F.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    # Keep a detached copy of every parameter so decoupled weight decay can be
    # applied against the pre-update values.
    param_list = {}
    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param):
                continue
            with param.block.program._optimized_guard(
                    [param, grad]), F.framework.name_scope("weight_decay"):
                # Decoupled weight decay: shrink the parameter towards zero
                # using its pre-update value, scaled by the scheduled LR.
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                L.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling


class AdamW(F.optimizer.AdamOptimizer):
    """AdamW (Adam with decoupled weight decay) for dygraph."""

    def __init__(self, *args, **kwargs):
        weight_decay = kwargs.pop('weight_decay', None)
        var_name_to_exclude = kwargs.pop(
            'var_name_to_exclude',
            '.*layer_norm_scale|.*layer_norm_bias|.*b_0')
        super(AdamW, self).__init__(*args, **kwargs)
        self.wd = weight_decay
        self.pat = re.compile(var_name_to_exclude)

    def apply_optimize(self, loss, startup_program, params_grads):
        super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
        # Apply decoupled weight decay after the Adam update, skipping
        # layer-norm and bias parameters matched by `var_name_to_exclude`.
        for p, g in params_grads:
            if not self.pat.match(p.name):
                L.assign(p * (1. - self.wd * self.current_step_lr()), p)


class LinearDecay(D.learning_rate_scheduler.LearningRateDecay):
    """Linear warmup followed by polynomial (default linear) decay,
    for dygraph training."""

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 decay_steps,
                 end_learning_rate=0,
                 power=1.0,
                 cycle=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(LinearDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle

    def step(self):
        if self.step_num < self.warmup_steps:
            # Warmup phase: scale linearly from 0 up to learning_rate.
            decayed_lr = self.learning_rate * (self.step_num /
                                               self.warmup_steps)
            decayed_lr = self.create_lr_var(decayed_lr)
        else:
            tmp_step_num = self.step_num
            tmp_decay_steps = self.decay_steps
            if self.cycle:
                div_res = L.ceil(
                    self.create_lr_var(tmp_step_num /
                                       float(self.decay_steps)))
                if tmp_step_num == 0:
                    div_res = self.create_lr_var(1.0)
                tmp_decay_steps = self.decay_steps * div_res
            else:
                tmp_step_num = self.create_lr_var(
                    tmp_step_num if tmp_step_num < self.decay_steps else
                    self.decay_steps)
            decayed_lr = (self.learning_rate - self.end_learning_rate) * \
                ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + \
                self.end_learning_rate
        return decayed_lr
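

# A minimal static-graph usage sketch (illustrative only, not part of this
# module): `build_model` and the step counts / learning rate / weight decay
# values below are hypothetical placeholders.
#
#     train_prog, startup_prog = F.Program(), F.Program()
#     with F.program_guard(train_prog, startup_prog):
#         loss = build_model()  # hypothetical: returns a scalar loss variable
#         scheduled_lr, loss_scaling = optimization(
#             loss=loss,
#             warmup_steps=1000,
#             num_train_steps=10000,
#             learning_rate=5e-5,
#             train_program=train_prog,
#             startup_prog=startup_prog,
#             weight_decay=0.01,
#             scheduler='linear_warmup_decay')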
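#
# A minimal dygraph usage sketch wiring `AdamW` together with `LinearDecay`
# (illustrative assumptions: `MyModel`, `inputs`, and the hyperparameters are
# placeholders; `parameter_list` is the fluid 1.x dygraph optimizer argument).
#
#     with D.guard():
#         model = MyModel()  # hypothetical dygraph Layer
#         lr = LinearDecay(
#             learning_rate=5e-5, warmup_steps=1000, decay_steps=10000)
#         opt = AdamW(
#             learning_rate=lr,
#             parameter_list=model.parameters(),
#             weight_decay=0.01)
#         loss = model(inputs)  # forward pass returning a scalar loss
#         loss.backward()
#         opt.minimize(loss)
#         model.clear_gradients()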