diff --git a/model_zoo/official/nlp/bert/run_pretrain.py b/model_zoo/official/nlp/bert/run_pretrain.py
index 749f6f6236e937f9def471e3b4f5e904fec0d146..1f31ff4015b55a163aea5a246a4aa4c693246834 100644
--- a/model_zoo/official/nlp/bert/run_pretrain.py
+++ b/model_zoo/official/nlp/bert/run_pretrain.py
@@ -106,6 +106,7 @@ def run_pretrain():
         new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
     else:
         args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
+        logger.info("train steps: {}".format(args_opt.train_steps))
 
     if cfg.optimizer == 'Lamb':
         lr_schedule = BertLearningRate(learning_rate=cfg.Lamb.learning_rate,
@@ -117,7 +118,8 @@ def run_pretrain():
         decay_params = list(filter(cfg.Lamb.decay_filter, params))
         other_params = list(filter(lambda x: x not in decay_params, params))
         group_params = [{'params': decay_params, 'weight_decay': cfg.Lamb.weight_decay},
-                        {'params': other_params}]
+                        {'params': other_params},
+                        {'order_params': params}]
         optimizer = Lamb(group_params, learning_rate=lr_schedule, eps=cfg.Lamb.eps)
     elif cfg.optimizer == 'Momentum':
         optimizer = Momentum(net_with_loss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
@@ -132,7 +134,8 @@ def run_pretrain():
         decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
         other_params = list(filter(lambda x: x not in decay_params, params))
         group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
-                        {'params': other_params, 'weight_decay': 0.0}]
+                        {'params': other_params, 'weight_decay': 0.0},
+                        {'order_params': params}]
         optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule,
                                     eps=cfg.AdamWeightDecay.eps)
     else:
diff --git a/model_zoo/official/nlp/bert/src/config.py b/model_zoo/official/nlp/bert/src/config.py
index e553f4b0385d8876f6aa0c5ec50f1237633e727e..2341007bd4478bb86d52f1d76f5220f87e58990c 100644
--- a/model_zoo/official/nlp/bert/src/config.py
+++ b/model_zoo/official/nlp/bert/src/config.py
@@ -26,7 +26,7 @@ cfg = edict({
     'optimizer': 'Lamb',
     'AdamWeightDecay': edict({
         'learning_rate': 3e-5,
-        'end_learning_rate': 1e-10,
+        'end_learning_rate': 0.0,
         'power': 5.0,
         'weight_decay': 1e-5,
         'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
@@ -35,7 +35,7 @@ cfg = edict({
     }),
     'Lamb': edict({
         'learning_rate': 3e-5,
-        'end_learning_rate': 1e-10,
+        'end_learning_rate': 0.0,
         'power': 10.0,
         'warmup_steps': 10000,
         'weight_decay': 0.01,
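
Note on the 'order_params' groups added above: in MindSpore, when an optimizer is built from a list of parameter groups, an extra group that contains only 'order_params' pins the optimizer's internal parameter ordering to the network's own ordering. Below is a minimal, self-contained sketch of the same grouping pattern; the nn.Dense stand-in network and the hyperparameter values are illustrative assumptions, not taken from this patch.

from mindspore import nn

# Stand-in network; the patch applies the same pattern to the BERT network with loss.
net = nn.Dense(16, 8)
params = net.trainable_params()

# Same split as in run_pretrain.py: apply weight decay to everything except LayerNorm and bias.
decay_params = list(filter(lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(), params))
other_params = list(filter(lambda x: x not in decay_params, params))

group_params = [{'params': decay_params, 'weight_decay': 0.01},
                {'params': other_params},
                # 'order_params' keeps the optimizer's parameter order identical to
                # the network's parameter order.
                {'order_params': params}]

# Hyperparameter values here are illustrative, not the ones from config.py.
optimizer = nn.Lamb(group_params, learning_rate=3e-5, eps=1e-6)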