diff --git a/examples/bert/bert_classifier.py b/examples/bert/bert_classifier.py
index c55eb2554d9074b03135c42aa746fa6c7a33bf27..472e39b3436bfe9c32a6941bcfeeac6badb82e5f 100644
--- a/examples/bert/bert_classifier.py
+++ b/examples/bert/bert_classifier.py
@@ -18,10 +18,10 @@ from hapi.metrics import Accuracy
 from hapi.configure import Config
 from hapi.text.bert import BertEncoder
 from paddle.fluid.dygraph import Linear, Layer
-from hapi.model import set_device, Model, Input
 from hapi.loss import SoftmaxWithCrossEntropy
+from hapi.model import set_device, Model, Input
 import hapi.text.tokenizer.tokenization as tokenization
-from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+from hapi.text.bert import BertConfig, BertDataLoader, BertInputExample, make_optimizer


 class ClsModelLayer(Model):
@@ -128,7 +128,7 @@
         [None, None], 'int64', name='src_ids'), Input(
             [None, None], 'int64', name='pos_ids'), Input(
                 [None, None], 'int64', name='sent_ids'), Input(
-                    [None, None], 'float32', name='input_mask')
+                    [None, None, 1], 'float32', name='input_mask')
     ]

     labels = [Input([None, 1], 'int64', name='label')]
@@ -139,13 +139,13 @@
         len(["contradiction", "entailment", "neutral"]),
         return_pooled_out=True)

-    optimizer = Optimizer(
+    optimizer = make_optimizer(
         warmup_steps=warmup_steps,
         num_train_steps=max_train_steps,
         learning_rate=config.learning_rate,
-        model_cls=cls_model,
         weight_decay=config.weight_decay,
         scheduler=config.lr_scheduler,
+        model=cls_model,
         loss_scaling=config.loss_scaling,
         parameter_list=cls_model.parameters())

@@ -157,8 +157,7 @@
         labels,
         device=device)

-    cls_model.bert_layer.init_parameters(
-        config.init_pretraining_params, verbose=config.verbose)
+    cls_model.bert_layer.load("./bert_small", reset_optimizer=True)

     # do train
     cls_model.fit(train_data=train_dataloader.dataloader,
diff --git a/examples/bert/run_classifier_single_gpu.sh b/examples/bert/run_classifier_single_gpu.sh
index 5b52aafd0a63dfb250c7ab7dcefc09b60f406ac2..16ca7230e8706db4210cc5ff7d0467cdf4007c0f 100755
--- a/examples/bert/run_classifier_single_gpu.sh
+++ b/examples/bert/run_classifier_single_gpu.sh
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1

 # start fine-tuning
 python3.7 bert_classifier.py\
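Note on the input_mask change above: its Input spec goes from [None, None] to [None, None, 1]. The diff does not show how BertEncoder consumes the mask, but in the usual BERT-style setup the self-attention bias is built from the padding mask with a batched matmul, which needs the trailing singleton axis so that [batch, seq, 1] x [batch, 1, seq] yields a [batch, seq, seq] bias. A minimal numpy sketch of that shape algebra (illustrative only, not the hapi implementation):

    import numpy as np

    batch, seq = 2, 5
    # 1.0 for real tokens, 0.0 for padding; note the kept trailing axis
    input_mask = np.ones((batch, seq, 1), dtype="float32")
    input_mask[:, 3:, :] = 0.0

    # [batch, seq, 1] @ [batch, 1, seq] -> [batch, seq, seq]
    attn_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))
    attn_bias = (attn_mask - 1.0) * 1e4   # large negative bias on padded positions
    print(attn_bias.shape)                # (2, 5, 5)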
diff --git a/examples/bert_leveldb/bert_classifier.py b/examples/bert_leveldb/bert_classifier.py
index 624e49c4d8c44d05c52f2e79a65dd8399a5b9f4c..012c42eba4c9be598e7cb7bd3e4b99c0e3f17f5f 100644
--- a/examples/bert_leveldb/bert_classifier.py
+++ b/examples/bert_leveldb/bert_classifier.py
@@ -18,10 +18,10 @@ from hapi.metrics import Accuracy
 from hapi.configure import Config
 from hapi.text.bert import BertEncoder
 from paddle.fluid.dygraph import Linear, Layer
-from hapi.model import set_device, Model, Input
 from hapi.loss import SoftmaxWithCrossEntropy
+from hapi.model import set_device, Model, Input
 import hapi.text.tokenizer.tokenization as tokenization
-from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+from hapi.text.bert import BertConfig, BertDataLoader, BertInputExample, make_optimizer


 class ClsModelLayer(Model):
@@ -99,12 +99,12 @@ def main():

     train_dataloader = BertDataLoader(
         "./data/glue_data/MNLI/train.tsv",
-        tokenizer, ["contradiction", "entailment", "neutral"],
+        tokenizer,
+        ["contradiction", "entailment", "neutral"],
         max_seq_length=config.max_seq_len,
         batch_size=config.batch_size,
         line_processor=mnli_line_processor,
-        mode="leveldb",
-        phase="train")
+        mode="leveldb", )

     test_dataloader = BertDataLoader(
         "./data/glue_data/MNLI/dev_matched.tsv",
@@ -130,7 +130,7 @@
         [None, None], 'int64', name='src_ids'), Input(
             [None, None], 'int64', name='pos_ids'), Input(
                 [None, None], 'int64', name='sent_ids'), Input(
-                    [None, None], 'float32', name='input_mask')
+                    [None, None, 1], 'float32', name='input_mask')
     ]

     labels = [Input([None, 1], 'int64', name='label')]
@@ -141,13 +141,13 @@
         len(["contradiction", "entailment", "neutral"]),
         return_pooled_out=True)

-    optimizer = Optimizer(
+    optimizer = make_optimizer(
         warmup_steps=warmup_steps,
         num_train_steps=max_train_steps,
         learning_rate=config.learning_rate,
-        model_cls=cls_model,
         weight_decay=config.weight_decay,
         scheduler=config.lr_scheduler,
+        model=cls_model,
         loss_scaling=config.loss_scaling,
         parameter_list=cls_model.parameters())

@@ -159,8 +159,7 @@
         labels,
         device=device)

-    cls_model.bert_layer.init_parameters(
-        config.init_pretraining_params, verbose=config.verbose)
+    cls_model.bert_layer.load("./bert_small", reset_optimizer=True)

     # do train
     cls_model.fit(train_data=train_dataloader.dataloader,
diff --git a/examples/bert_leveldb/run_classifier_multi_gpu.sh b/examples/bert_leveldb/run_classifier_multi_gpu.sh
index 1b7d6aea60c385e32bafbcfd35ae420f1e5824a6..7d545fe09d0fd2f540b08754caf408fe2f22de56 100755
--- a/examples/bert_leveldb/run_classifier_multi_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_multi_gpu.sh
@@ -5,7 +5,7 @@ DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

 # start fine-tuning
-python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py\
+python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=1,2,3 bert_classifier.py\
 --use_cuda true \
 --do_train true \
 --do_test true \
diff --git a/examples/bert_leveldb/run_classifier_single_gpu.sh b/examples/bert_leveldb/run_classifier_single_gpu.sh
index 5b52aafd0a63dfb250c7ab7dcefc09b60f406ac2..16ca7230e8706db4210cc5ff7d0467cdf4007c0f 100755
--- a/examples/bert_leveldb/run_classifier_single_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_single_gpu.sh
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1

 # start fine-tuning
 python3.7 bert_classifier.py\
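Note: bert_classifier.py feeds warmup_steps and max_train_steps into the optimizer, but their computation sits outside the hunks shown here. For reference, the conventional BERT fine-tuning arithmetic looks roughly like the sketch below; the variable names and values are assumptions for illustration, not code from this repository:

    # assumed example values
    num_train_examples = 392702      # size of the MNLI train split
    batch_size = 64
    epoch = 3
    warmup_proportion = 0.1

    max_train_steps = epoch * num_train_examples // batch_size
    warmup_steps = int(warmup_proportion * max_train_steps)
    print(max_train_steps, warmup_steps)   # 18407 1840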
diff --git a/hapi/text/bert/__init__.py b/hapi/text/bert/__init__.py
index cd1332dfe97a5636ef6a0a855fa4931ba5903688..b634f9a6adce27d5dcd4d552799c4e0771d8950d 100644
--- a/hapi/text/bert/__init__.py
+++ b/hapi/text/bert/__init__.py
@@ -13,7 +13,9 @@
 # limitations under the License.

 from hapi.text.bert.bert import BertConfig as BertConfig
-from hapi.text.bert.optimization import Optimizer as Optimizer
+from hapi.text.bert.dygraph_optimization import DyOptimizer as DyOptimizer
+from hapi.text.bert.static_optimization import StOptimizer as StOptimizer
+from hapi.text.bert.optimization import make_optimizer as make_optimizer
 from hapi.text.bert.dataloader import BertDataLoader as BertDataLoader
 from hapi.text.bert.dataloader import BertInputExample as BertInputExample
 from hapi.text.tokenizer import tokenization as tokenization
diff --git a/hapi/text/bert/bert.py b/hapi/text/bert/bert.py
index fdf17ac100b7d9b3b4cecee2f9d3af9fc3d1ca84..be0203d9c8ac435dd8ec6a225bc71ad8b121b91d 100644
--- a/hapi/text/bert/bert.py
+++ b/hapi/text/bert/bert.py
@@ -23,8 +23,8 @@ import numpy as np

 import paddle
 import paddle.fluid as fluid
+from hapi.model import Model
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
-
 from hapi.text.text import PrePostProcessLayer, TransformerEncoder

 from hapi.text.bert.utils.init import init_from_static_model
@@ -52,7 +52,7 @@ class BertConfig(object):
         print('------------------------------------------------')


-class BertEncoder(Layer):
+class BertEncoder(Model):
     """
     bert
     """
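Note on the bert.py change above: BertEncoder now derives from hapi.model.Model rather than paddle.fluid.dygraph.Layer, which is what allows the classifier examples to call cls_model.bert_layer.load("./bert_small", reset_optimizer=True) in place of the old init_parameters path. A hedged sketch of loading a pretrained encoder on its own; the BertConfig path and the BertEncoder constructor arguments are assumptions, only the load call mirrors the examples:

    import paddle.fluid as fluid
    from hapi.text.bert import BertConfig, BertEncoder

    with fluid.dygraph.guard():
        config = BertConfig("./bert_small/bert_config.json")   # assumed config location
        encoder = BertEncoder(config)                          # constructor signature assumed
        encoder.load("./bert_small", reset_optimizer=True)     # same call the examples use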
+"""Optimization and learning rate scheduling.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle.fluid as fluid + +from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay + + +class ConstantLR(LearningRateDecay): + def __init__(self, learning_rate, begin=0, step=1, dtype='float32'): + super(ConstantLR, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + + def step(self): + return self.learning_rate + + +class LinearDecay(LearningRateDecay): + def __init__(self, + learning_rate, + warmup_steps, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + begin=0, + step=1, + dtype='float32'): + super(LinearDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.warmup_steps = warmup_steps + self.decay_steps = decay_steps + self.end_learning_rate = end_learning_rate + self.power = power + self.cycle = cycle + + def step(self): + if self.step_num < self.warmup_steps: + decayed_lr = self.learning_rate * (self.step_num / + self.warmup_steps) + decayed_lr = self.create_lr_var(decayed_lr) + else: + tmp_step_num = self.step_num + tmp_decay_steps = self.decay_steps + if self.cycle: + div_res = fluid.layers.ceil( + self.create_lr_var(tmp_step_num / float(self.decay_steps))) + if tmp_step_num == 0: + div_res = self.create_lr_var(1.0) + tmp_decay_steps = self.decay_steps * div_res + else: + tmp_step_num = self.create_lr_var( + tmp_step_num + if tmp_step_num < self.decay_steps else self.decay_steps) + decayed_lr = (self.learning_rate - self.end_learning_rate) * \ + ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate + + return decayed_lr + + +class DyOptimizer(object): + def __init__(self, + warmup_steps, + num_train_steps, + learning_rate, + model_cls, + weight_decay, + scheduler='linear_warmup_decay', + loss_scaling=1.0, + parameter_list=None): + self.warmup_steps = warmup_steps + self.num_train_steps = num_train_steps + self.learning_rate = learning_rate + self.model_cls = model_cls + self.weight_decay = weight_decay + self.scheduler = scheduler + self.loss_scaling = loss_scaling + self.parameter_list = parameter_list + + self.scheduled_lr = 0.0 + self.optimizer = self.lr_schedule() + + def lr_schedule(self): + if self.warmup_steps > 0: + if self.scheduler == 'noam_decay': + self.scheduled_lr = fluid.dygraph.NoamDecay(1 / ( + self.warmup_steps * (self.learning_rate**2)), + self.warmup_steps) + elif self.scheduler == 'linear_warmup_decay': + self.scheduled_lr = LinearDecay(self.learning_rate, + self.warmup_steps, + self.num_train_steps, 0.0) + else: + raise ValueError("Unkown learning rate scheduler, should be " + "'noam_decay' or 'linear_warmup_decay'") + optimizer = fluid.optimizer.Adam( + learning_rate=self.scheduled_lr, + parameter_list=self.parameter_list) + else: + self.scheduled_lr = ConstantLR(self.learning_rate) + optimizer = fluid.optimizer.Adam( + learning_rate=self.scheduled_lr, + parameter_list=self.parameter_list) + + return optimizer + + def exclude_from_weight_decay(self, name): + if name.find("layer_norm") > -1: + return True + bias_suffix = ["_bias", "_b", ".b_0"] + for suffix in bias_suffix: + if name.endswith(suffix): + return True + return False + + def state_dict(self): + return self.optimizer.state_dict() + + def set_dict(self, state_dict): + return self.optimizer.set_dict(state_dict) + + def get_opti_var_name_list(self): + return 
diff --git a/hapi/text/bert/optimization.py b/hapi/text/bert/optimization.py
old mode 100755
new mode 100644
index b2ba8f65a744754e8ff96ca66ccf818bc8b06c34..d9f8d277ca9d64486aa3304ffb91657da287a5f6
--- a/hapi/text/bert/optimization.py
+++ b/hapi/text/bert/optimization.py
@@ -11,172 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Optimization and learning rate scheduling."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import paddle.fluid as fluid
-
-from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
-
-
-class ConstantLR(LearningRateDecay):
-    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
-        super(ConstantLR, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-
-    def step(self):
-        return self.learning_rate
-
-
-class LinearDecay(LearningRateDecay):
-    def __init__(self,
-                 learning_rate,
-                 warmup_steps,
-                 decay_steps,
-                 end_learning_rate=0.0001,
-                 power=1.0,
-                 cycle=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(LinearDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.warmup_steps = warmup_steps
-        self.decay_steps = decay_steps
-        self.end_learning_rate = end_learning_rate
-        self.power = power
-        self.cycle = cycle
-
-    def step(self):
-        if self.step_num < self.warmup_steps:
-            decayed_lr = self.learning_rate * (self.step_num /
-                                               self.warmup_steps)
-            decayed_lr = self.create_lr_var(decayed_lr)
-        else:
-            tmp_step_num = self.step_num
-            tmp_decay_steps = self.decay_steps
-            if self.cycle:
-                div_res = fluid.layers.ceil(
-                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
-                if tmp_step_num == 0:
-                    div_res = self.create_lr_var(1.0)
-                tmp_decay_steps = self.decay_steps * div_res
-            else:
-                tmp_step_num = self.create_lr_var(
-                    tmp_step_num
-                    if tmp_step_num < self.decay_steps else self.decay_steps)
-            decayed_lr = (self.learning_rate - self.end_learning_rate) * \
-                ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
-
-        return decayed_lr
-
-
-class Optimizer(object):
-    def __init__(self,
-                 warmup_steps,
-                 num_train_steps,
-                 learning_rate,
-                 model_cls,
-                 weight_decay,
-                 scheduler='linear_warmup_decay',
-                 loss_scaling=1.0,
-                 parameter_list=None):
-        self.warmup_steps = warmup_steps
-        self.num_train_steps = num_train_steps
-        self.learning_rate = learning_rate
-        self.model_cls = model_cls
-        self.weight_decay = weight_decay
-        self.scheduler = scheduler
-        self.loss_scaling = loss_scaling
-        self.parameter_list = parameter_list
-
-        self.scheduled_lr = 0.0
-        self.optimizer = self.lr_schedule()
-
-    def lr_schedule(self):
-        if self.warmup_steps > 0:
-            if self.scheduler == 'noam_decay':
-                self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
-                    self.warmup_steps * (self.learning_rate**2)),
-                                                            self.warmup_steps)
-            elif self.scheduler == 'linear_warmup_decay':
-                self.scheduled_lr = LinearDecay(self.learning_rate,
-                                                self.warmup_steps,
-                                                self.num_train_steps, 0.0)
-            else:
-                raise ValueError("Unkown learning rate scheduler, should be "
-                                 "'noam_decay' or 'linear_warmup_decay'")
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=self.scheduled_lr,
-                parameter_list=self.parameter_list)
-        else:
-            self.scheduled_lr = ConstantLR(self.learning_rate)
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=self.scheduled_lr,
-                parameter_list=self.parameter_list)
-
-        return optimizer
-
-    def exclude_from_weight_decay(self, name):
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    def state_dict(self):
-        return self.optimizer.state_dict()
-
-    def set_dict(self, state_dict):
-        return self.optimizer.set_dict(state_dict)
-
-    def get_opti_var_name_list(self):
-        return self.optimizer.get_opti_var_name_list()
-
-"""Optimization and learning rate scheduling.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle.fluid as fluid - -from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay - - -class ConstantLR(LearningRateDecay): - def __init__(self, learning_rate, begin=0, step=1, dtype='float32'): - super(ConstantLR, self).__init__(begin, step, dtype) - self.learning_rate = learning_rate - - def step(self): - return self.learning_rate - - -class LinearDecay(LearningRateDecay): - def __init__(self, - learning_rate, - warmup_steps, - decay_steps, - end_learning_rate=0.0001, - power=1.0, - cycle=False, - begin=0, - step=1, - dtype='float32'): - super(LinearDecay, self).__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.warmup_steps = warmup_steps - self.decay_steps = decay_steps - self.end_learning_rate = end_learning_rate - self.power = power - self.cycle = cycle - - def step(self): - if self.step_num < self.warmup_steps: - decayed_lr = self.learning_rate * (self.step_num / - self.warmup_steps) - decayed_lr = self.create_lr_var(decayed_lr) - else: - tmp_step_num = self.step_num - tmp_decay_steps = self.decay_steps - if self.cycle: - div_res = fluid.layers.ceil( - self.create_lr_var(tmp_step_num / float(self.decay_steps))) - if tmp_step_num == 0: - div_res = self.create_lr_var(1.0) - tmp_decay_steps = self.decay_steps * div_res - else: - tmp_step_num = self.create_lr_var( - tmp_step_num - if tmp_step_num < self.decay_steps else self.decay_steps) - decayed_lr = (self.learning_rate - self.end_learning_rate) * \ - ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate - - return decayed_lr - - -class Optimizer(object): - def __init__(self, - warmup_steps, - num_train_steps, - learning_rate, - model_cls, - weight_decay, - scheduler='linear_warmup_decay', - loss_scaling=1.0, - parameter_list=None): - self.warmup_steps = warmup_steps - self.num_train_steps = num_train_steps - self.learning_rate = learning_rate - self.model_cls = model_cls - self.weight_decay = weight_decay - self.scheduler = scheduler - self.loss_scaling = loss_scaling - self.parameter_list = parameter_list - - self.scheduled_lr = 0.0 - self.optimizer = self.lr_schedule() - - def lr_schedule(self): - if self.warmup_steps > 0: - if self.scheduler == 'noam_decay': - self.scheduled_lr = fluid.dygraph.NoamDecay(1 / ( - self.warmup_steps * (self.learning_rate**2)), - self.warmup_steps) - elif self.scheduler == 'linear_warmup_decay': - self.scheduled_lr = LinearDecay(self.learning_rate, - self.warmup_steps, - self.num_train_steps, 0.0) - else: - raise ValueError("Unkown learning rate scheduler, should be " - "'noam_decay' or 'linear_warmup_decay'") - optimizer = fluid.optimizer.Adam( - learning_rate=self.scheduled_lr, - parameter_list=self.parameter_list) - else: - self.scheduled_lr = ConstantLR(self.learning_rate) - optimizer = fluid.optimizer.Adam( - learning_rate=self.scheduled_lr, - parameter_list=self.parameter_list) - - return optimizer - - def exclude_from_weight_decay(self, name): - if name.find("layer_norm") > -1: - return True - bias_suffix = ["_bias", "_b", ".b_0"] - for suffix in bias_suffix: - if name.endswith(suffix): - return True - return False - - def state_dict(self): - return self.optimizer.state_dict() - - def set_dict(self, state_dict): - return self.optimizer.set_dict(state_dict) - - def get_opti_var_name_list(self): - return self.optimizer.get_opti_var_name_list() - 
diff --git a/hapi/text/bert/static_optimization.py b/hapi/text/bert/static_optimization.py
index a577d1bf05091d5101ff49d61ec3aed8fefcbb14..adc8f87ce3269478cab81a5434d9d71fff30e20e 100644
--- a/hapi/text/bert/static_optimization.py
+++ b/hapi/text/bert/static_optimization.py
@@ -19,7 +19,6 @@ from __future__ import print_function

 import numpy as np
 import paddle.fluid as fluid
-from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling


 def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
@@ -51,128 +50,95 @@
     return lr


-def optimization(loss,
+class StOptimizer(fluid.optimizer.Optimizer):
+    def __init__(self,
                  warmup_steps,
                  num_train_steps,
                  learning_rate,
-                 train_program,
-                 startup_prog,
                  weight_decay,
-                 scheduler='linear_warmup_decay',
-                 use_fp16=False,
-                 use_dynamic_loss_scaling=False,
-                 init_loss_scaling=1.0,
-                 incr_every_n_steps=1000,
-                 decr_every_n_nan_or_inf=2,
-                 incr_ratio=2.0,
-                 decr_ratio=0.8):
-
-    scheduled_lr, loss_scaling = None, None
-    if scheduler == 'noam_decay':
-        if warmup_steps > 0:
-            scheduled_lr = fluid.layers.learning_rate_scheduler\
-                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
-                            warmup_steps)
+                 scheduler='linear_warmup_decay'):
+        super(StOptimizer, self).__init__(
+            learning_rate=learning_rate,
+            parameter_list=None,
+            regularization=None,
+            grad_clip=None,
+            name=None)
+        self.warmup_steps = warmup_steps
+        self.num_train_steps = num_train_steps
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.scheduler = scheduler
+
+    def minimize(self, loss):
+
+        train_program = fluid.default_main_program()
+        startup_program = fluid.default_startup_program()
+
+        if self.scheduler == 'noam_decay':
+            if self.warmup_steps > 0:
+                scheduled_lr = fluid.layers.learning_rate_scheduler\
+                    .noam_decay(1/(self.warmup_steps *(self.learning_rate ** 2)),
+                                self.warmup_steps)
+            else:
+                print(
+                    "WARNING: noam decay of learning rate should have postive warmup "
+                    "steps but given {}, using constant learning rate instead!"
+                    .format(self.warmup_steps))
+                scheduled_lr = fluid.layers.create_global_var(
+                    name=fluid.unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=self.learning_rate,
+                    dtype='float32',
+                    persistable=True)
+        elif self.scheduler == 'linear_warmup_decay':
+            if self.warmup_steps > 0:
+                scheduled_lr = linear_warmup_decay(self.learning_rate,
+                                                   self.warmup_steps,
+                                                   self.num_train_steps)
+            else:
+                print(
+                    "WARNING: linear warmup decay of learning rate should have "
+                    "postive warmup steps but given {}, use constant learning rate "
+                    "instead!".format(self.warmup_steps))
+                scheduled_lr = fluid.layers.create_global_var(
+                    name=fluid.unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=self.learning_rate,
+                    dtype='float32',
+                    persistable=True)
         else:
-            print(
-                "WARNING: noam decay of learning rate should have postive warmup "
-                "steps but given {}, using constant learning rate instead!"
-                .format(warmup_steps))
-            scheduled_lr = fluid.layers.create_global_var(
-                name=fluid.unique_name.generate("learning_rate"),
-                shape=[1],
-                value=learning_rate,
-                dtype='float32',
-                persistable=True)
-    elif scheduler == 'linear_warmup_decay':
-        if warmup_steps > 0:
-            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
-                                               num_train_steps)
-        else:
-            print(
-                "WARNING: linear warmup decay of learning rate should have "
-                "postive warmup steps but given {}, use constant learning rate "
-                "instead!".format(warmup_steps))
-            scheduled_lr = fluid.layers.create_global_var(
-                name=fluid.unique_name.generate("learning_rate"),
-                shape=[1],
-                value=learning_rate,
-                dtype='float32',
-                persistable=True)
-    else:
-        raise ValueError("Unkown learning rate scheduler, should be "
-                         "'noam_decay' or 'linear_warmup_decay'")
-
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
-
-    def exclude_from_weight_decay(param):
-        name = param.name.rstrip(".master")
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    param_list = dict()
-
-    if use_fp16:
-        loss_scaling = fluid.layers.create_global_var(
-            name=fluid.unique_name.generate("loss_scaling"),
-            shape=[1],
-            value=init_loss_scaling,
-            dtype='float32',
-            persistable=True)
-        loss *= loss_scaling
-
-        param_grads = optimizer.backward(loss)
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, loss_scaling)
-
-        if weight_decay > 0:
-            for param, _ in master_param_grads:
-                param_list[param.name] = param * 1.0
-                param_list[param.name].stop_gradient = True
+            raise ValueError("Unkown learning rate scheduler, should be "
+                             "'noam_decay' or 'linear_warmup_decay'")

-        if use_dynamic_loss_scaling:
-            apply_dynamic_loss_scaling(
-                loss_scaling, master_param_grads, incr_every_n_steps,
-                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
+        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+        fluid.clip.set_gradient_clip(
+            clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

-        optimizer.apply_gradients(master_param_grads)
-
-        if weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
+        def exclude_from_weight_decay(param):
+            name = param.name.rstrip(".master")
+            if name.find("layer_norm") > -1:
+                return True
+            bias_suffix = ["_bias", "_b", ".b_0"]
+            for suffix in bias_suffix:
+                if name.endswith(suffix):
+                    return True
+            return False

-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
+        param_list = dict()

-    else:
-        if weight_decay > 0:
+        if self.weight_decay > 0:
             for param in train_program.all_parameters():
                 param_list[param.name] = param * 1.0
                 param_list[param.name].stop_gradient = True

         _, param_grads = optimizer.minimize(loss)

-        if weight_decay > 0:
+        if self.weight_decay > 0:
             for param, grad in param_grads:
                 if exclude_from_weight_decay(param):
                     continue
                 with param.block.program._optimized_guard(
                     [param, grad]), fluid.framework.name_scope("weight_decay"):
                     updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
+                        param.name] * self.weight_decay * scheduled_lr
                     fluid.layers.assign(output=param, input=updated_param)
-
-        return scheduled_lr, loss_scaling
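Both the dygraph and the static optimizer implement weight decay the same way: snapshot each parameter before the Adam step, then subtract weight_decay * lr * snapshot from the updated value, skipping LayerNorm parameters and biases via exclude_from_weight_decay. That is the decoupled, AdamW-style decay used by the original BERT code rather than L2 regularization folded into the gradients. A framework-independent sketch of the rule (names are mine, the logic mirrors the code above):

    def exclude_from_weight_decay(name):
        # LayerNorm scales/offsets and bias parameters are left undecayed
        if "layer_norm" in name:
            return True
        return name.endswith(("_bias", "_b", ".b_0"))

    def apply_weight_decay(params, pre_step_values, weight_decay, lr):
        # params: name -> value after the Adam step
        # pre_step_values: name -> value snapshotted before the step
        for name, value in params.items():
            if exclude_from_weight_decay(name):
                continue
            params[name] = value - pre_step_values[name] * weight_decay * lr
        return params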
diff --git a/hapi/text/text.py b/hapi/text/text.py
index 2b99f81f58dda36d59275790eb9acf767552de6b..ed803ae08eb16eed596e7097cb1c5fb6e1de2dbe 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -1096,7 +1096,8 @@ class PrePostProcessLayer(Layer):
         self.functors = []
         for cmd in self.process_cmd:
             if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
+                self.functors.append(
+                    lambda x, y: x + y if y is not None else x)
             elif cmd == "n":  # add layer normalization
                 if reused_layer_norm is not None:
                     layer_norm = reused_layer_norm
@@ -1218,7 +1219,7 @@ class MultiHeadAttention(Layer):
         # scale dot product attention
         product = layers.matmul(
             x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
+        if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
         if self.dropout_rate:
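The text.py hunks above replace truthiness tests on possibly-None tensors (if attn_bias:, ... if y else x) with explicit "is not None" checks. Truth-testing a framework tensor is ambiguous at best (element-wise semantics or an error for multi-element tensors, depending on mode), and it would also wrongly treat a legitimate all-zero bias or residual as "absent". The same pattern in plain Python:

    def add_residual(x, y=None):
        # Correct: skip the addition only when no tensor was supplied at all.
        return x + y if y is not None else x

    # Anti-pattern (the old code): "if y:" asks the tensor for a truth value
    # and would also drop a real, all-zero residual.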