From fc72dc607f452fc821a3c9ced06871d481c2f314 Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Fri, 29 Mar 2019 00:00:01 +0800
Subject: [PATCH] migrate bert to latest interface

---
 demo/bert-cls/finetune_with_hub.py    |  11 ++-
 demo/bert-cls/run_fintune_with_hub.sh |   2 +-
 paddle_hub/finetune/config.py         |   6 +-
 paddle_hub/finetune/finetune.py       |  66 +++++++--------
 paddle_hub/finetune/network.py        |   1 -
 paddle_hub/finetune/optimization.py   | 117 ++++++++++++++++++--------
 paddle_hub/finetune/task.py           |   3 -
 7 files changed, 125 insertions(+), 81 deletions(-)

diff --git a/demo/bert-cls/finetune_with_hub.py b/demo/bert-cls/finetune_with_hub.py
index 34177f46..78576131 100644
--- a/demo/bert-cls/finetune_with_hub.py
+++ b/demo/bert-cls/finetune_with_hub.py
@@ -83,7 +83,10 @@ if __name__ == '__main__':
         batch_size=args.batch_size,
         max_seq_len=args.max_seq_len,
         weight_decay=args.weight_decay,
-        in_tokens=args.in_tokens,
+        finetune_strategy="bert_finetune",
+        with_memory_optimization=True,
+        in_tokens=True,
+        optimizer=None,
         warmup_proportion=args.warmup_proportion)
 
     processor = reader.ChnsenticorpProcessor(
@@ -123,4 +126,8 @@
 
     # Finetune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
-    hub.finetune_and_eval(cls_task, feed_list, processor, config)
+    hub.finetune_and_eval(
+        task=cls_task,
+        data_processor=processor,
+        feed_list=feed_list,
+        config=config)
diff --git a/demo/bert-cls/run_fintune_with_hub.sh b/demo/bert-cls/run_fintune_with_hub.sh
index a54d88ca..c1b82def 100644
--- a/demo/bert-cls/run_fintune_with_hub.sh
+++ b/demo/bert-cls/run_fintune_with_hub.sh
@@ -1,4 +1,4 @@
-export CUDA_VISIBLE_DEVICES=6
+export CUDA_VISIBLE_DEVICES=2
 
 BERT_BASE_PATH="chinese_L-12_H-768_A-12"
 TASK_NAME='chnsenticorp'
diff --git a/paddle_hub/finetune/config.py b/paddle_hub/finetune/config.py
index 374d26e7..7bc64359 100644
--- a/paddle_hub/finetune/config.py
+++ b/paddle_hub/finetune/config.py
@@ -30,6 +30,8 @@ FinetuneConfig = collections.namedtuple(
         'weight_decay',  # for bert
         'warmup_proportion',  # for bert
         'in_tokens',  # for bert
-        'strategy',
-        'with_memory_optimization'
+        'finetune_strategy',
+        'with_memory_optimization',
+        # learning rate scheduler
+        'optimizer'
     ])
diff --git a/paddle_hub/finetune/finetune.py b/paddle_hub/finetune/finetune.py
index 2bb0909e..8d54ebd7 100644
--- a/paddle_hub/finetune/finetune.py
+++ b/paddle_hub/finetune/finetune.py
@@ -23,17 +23,7 @@
 import paddle
 import paddle.fluid as fluid
 from paddle_hub.tools.logger import logger
-
-
-def optimizer_config_for_strategy(strategy, parameters, data_processor,
-                                  dev_count):
-    # basic configuration
-    learning_rate = 1e-4
-    optimizer = fluid.optimizer.Adam(learning_rate)
-    regularizer = fluid.regularizer.L2DecayRegularizer(
-        regularization_coeff=1e-4)
-
-    return optimizer
+from paddle_hub.finetune.optimization import bert_finetune
 
 
 def _finetune_model(task,
@@ -51,12 +41,10 @@
     learning_rate = config.learning_rate
     use_cuda = config.use_cuda
     batch_size = config.batch_size
-    strategy = config.strategy
     with_memory_optimization = config.with_memory_optimization
     checkpoint_dir = config.checkpoint_dir
 
     with fluid.program_guard(main_program, startup_program):
-
         if use_cuda:
             place = fluid.CUDAPlace(0)
             dev_count = fluid.core.get_cuda_device_count()
@@ -65,17 +53,20 @@
             dev_count = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
 
-        optimizer = optimizer_config_for_strategy(
-            strategy=strategy,
-            parameters=None,
-            data_processor=data_processor,
-            dev_count=dev_count)
-        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
         exe = fluid.Executor(place=place)
-        optimizer.minimize(loss)
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+
+        if config.finetune_strategy == "bert_finetune":
+            scheduled_lr = bert_finetune(task, main_program, data_processor,
+                                         config, dev_count)
+        elif config.optimizer == "adam":
+            optimizer = fluid.optimizer.Adam(learning_rate=config.learning_rate)
+            optimizer.minimize(loss)
+        # TODO: add more finetune strategies
 
         if with_memory_optimization:
-            logger.info("Memory optimize start")
+            logger.info("Memory optimization start...")
+            optimize_time_begin = time.time()
             fluid.memory_optimize(
                 input_program=fluid.default_main_program(),
                 skip_opt_set=[
@@ -83,7 +74,9 @@
                     loss.name, accuracy.name
                 ])
-            logger.info("Memory optimize end")
+            time_used = time.time() - optimize_time_begin
+            logger.info(
+                "Memory optimization done! Time elapsed %f sec" % time_used)
 
         # initilize all parameters
         exe.run(fluid.default_startup_program())
 
@@ -91,13 +84,12 @@
         logger.info("Finetune start")
         train_time_begin = time.time()
         for index in range(epoch):
-            train_reader = paddle.batch(
-                data_processor.data_generator(phase='train'),
-                batch_size=batch_size)
+            train_reader = data_processor.data_generator(
+                batch_size=batch_size, phase='train')
             size = accuracy_sum = loss_sum = 0
             for batch in train_reader():
                 loss_v, accuracy_v = exe.run(
-                    feed=data_feeder.feed(batch),
+                    feed=data_feeder.feed([batch]),
                     fetch_list=[loss.name, accuracy.name])
                 step += 1
                 size += len(batch)
@@ -106,16 +98,16 @@
 
                 if step % config.log_interval == 0:
                     train_time_used = time.time() - train_time_begin
-                    perf = train_time_used / config.log_interval
+                    speed = config.log_interval / train_time_used
                     train_time_begin = time.time()
                     logger.info(
                         "step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                        (step, loss_sum / size, accuracy_sum / size, perf))
+                        (step, loss_sum / size, accuracy_sum / size, speed))
                     size = accuracy_sum = loss_sum = 0
 
                 if step % config.save_ckpt_interval == 0:
-                    model_save_dir = os.path.join(
-                        checkpoint_dir, "model_parameters_in_step%d" % step)
+                    model_save_dir = os.path.join(checkpoint_dir,
+                                                  "step_%d" % step)
                     fluid.io.save_persistables(exe, dirname=model_save_dir)
 
                 if eval_model and step % config.eval_interval == 0:
@@ -123,7 +115,7 @@
         # eval before end
         if eval_model:
             eval(task, data_processor, feed_list, config)
-        logger.info("Finetune end")
+        logger.info("Finetune finished")
 
 
 def save_model_and_checkpoint(task, save_dir):
@@ -150,22 +142,22 @@
     accuracy = task.variable("accuracy")
     use_cuda = config.use_cuda
     batch_size = config.batch_size
-    logger.info("[Evaluation] start")
 
     with fluid.program_guard(inference_program):
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
         exe = fluid.Executor(place=place)
         size = accuracy_sum = loss_sum = 0
-        test_reader = paddle.batch(
-            data_processor.data_generator(phase='test'), batch_size=batch_size)
+        test_reader = data_processor.data_generator(
+            batch_size=batch_size, phase='test')
        eval_time_begin = time.time()
         for index, batch in enumerate(test_reader()):
             loss_v, accuracy_v, = exe.run(
-                feed=data_feeder.feed(batch), fetch_list=[loss, accuracy.name])
+                feed=data_feeder.feed([batch]),
+                fetch_list=[loss, accuracy.name])
             size += len(batch)
             accuracy_sum += accuracy_v * len(batch)
             loss_sum += loss_v * len(batch)
 
         eval_time_used = time.time() - eval_time_begin
-        perf = eval_time_used / index
+        eval_speed = index / eval_time_used
         logger.info("[Evaluation] loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                    (loss_sum / size, accuracy_sum / size, perf))
+                    (loss_sum / size, accuracy_sum / size, eval_speed))
diff --git a/paddle_hub/finetune/network.py b/paddle_hub/finetune/network.py
index 1ee14178..48f5804b 100644
--- a/paddle_hub/finetune/network.py
+++ b/paddle_hub/finetune/network.py
@@ -19,7 +19,6 @@ import time
 import numpy as np
 import multiprocessing
 
-from paddle_hub.finetune.optimization import bert_optimization
 from .task import Task
 
 __all__ = ['append_mlp_classifier']
diff --git a/paddle_hub/finetune/optimization.py b/paddle_hub/finetune/optimization.py
index 6a7cbcd5..8e489d7a 100644
--- a/paddle_hub/finetune/optimization.py
+++ b/paddle_hub/finetune/optimization.py
@@ -19,43 +19,90 @@ from __future__ import print_function
 
 import numpy as np
 import paddle.fluid as fluid
+"""
+Finetune optimization strategy
+"""
 
-def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
-    """ Applies linear warmup of learning rate from 0 and decay to 0."""
-    with fluid.default_main_program()._lr_schedule_guard():
-        lr = fluid.layers.tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="scheduled_learning_rate")
-
-        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
-
-        with fluid.layers.control_flow.Switch() as switch:
-            with switch.case(global_step < warmup_steps):
-                warmup_lr = learning_rate * (global_step / warmup_steps)
-                fluid.layers.tensor.assign(warmup_lr, lr)
-            with switch.default():
-                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
-                    learning_rate=learning_rate,
-                    decay_steps=num_train_steps,
-                    end_learning_rate=0.0,
-                    power=1.0,
-                    cycle=False)
-                fluid.layers.tensor.assign(decayed_lr, lr)
-
-        return lr
-
-
-def bert_optimization(loss,
-                      warmup_steps,
-                      num_train_steps,
-                      learning_rate,
-                      train_program,
-                      weight_decay,
-                      scheduler='linear_warmup_decay'):
+def bert_finetune(task, train_program, data_processor, config, dev_count):
+    # calculate warmup steps
+    num_train_examples = data_processor.get_num_examples(phase='train')
+    max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
+    warmup_steps = int(max_train_steps * config.warmup_proportion)
+
+    loss = task.variable("loss")
+    scheduled_lr = adam_weight_decay_optimizer_with_linear_warmup(
+        loss, warmup_steps, max_train_steps, config.learning_rate,
+        train_program, config.weight_decay)
+
+    return scheduled_lr
+
+
+def adam_weight_decay_optimizer_with_noam_decay(
+        loss,
+        warmup_steps,
+        num_train_steps,
+        learning_rate,
+        train_program,
+        weight_decay,
+        scheduler='linear_warmup_decay'):
+    if warmup_steps > 0:
+        if scheduler == 'noam_decay':
+            scheduled_lr = fluid.layers.learning_rate_scheduler\
+                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
+                            warmup_steps)
+        elif scheduler == 'linear_warmup_decay':
+            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
+                                               num_train_steps)
+        else:
+            raise ValueError("Unknown learning rate scheduler, should be "
+                             "'noam_decay' or 'linear_warmup_decay'")
+        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+    else:
+        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
+        scheduled_lr = learning_rate
+
+    clip_norm_thres = 1.0
+    fluid.clip.set_gradient_clip(
+        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
+
+    def exclude_from_weight_decay(name):
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    param_list = dict()
+
+    for param in train_program.global_block().all_parameters():
+        param_list[param.name] = param * 1.0
+        param_list[param.name].stop_gradient = True
+
+    _, param_grads = optimizer.minimize(loss)
+
+    if weight_decay > 0:
+        for param, grad in param_grads:
+            if exclude_from_weight_decay(param.name):
+                continue
+            with param.block.program._optimized_guard(
+                [param, grad]), fluid.framework.name_scope("weight_decay"):
+                updated_param = param - param_list[
+                    param.name] * weight_decay * scheduled_lr
+                fluid.layers.assign(output=param, input=updated_param)
+
+    return scheduled_lr
+
+
+def adam_weight_decay_optimizer_with_linear_warmup(loss,
+                                                   warmup_steps,
+                                                   num_train_steps,
+                                                   learning_rate,
+                                                   train_program,
+                                                   weight_decay,
+                                                   scheduler='noam_decay'):
     if warmup_steps > 0:
         if scheduler == 'noam_decay':
             scheduled_lr = fluid.layers.learning_rate_scheduler\
diff --git a/paddle_hub/finetune/task.py b/paddle_hub/finetune/task.py
index 9275e95d..0912181a 100644
--- a/paddle_hub/finetune/task.py
+++ b/paddle_hub/finetune/task.py
@@ -19,9 +19,6 @@ import time
 import numpy as np
 import multiprocessing
 
-from paddle_hub.finetune.optimization import bert_optimization
-from paddle_hub.finetune.config import FinetuneConfig
-
 
 class Task(object):
     def __init__(self, task_type, graph_var_dict, main_program,
-- 
GitLab
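
Editor's note, not part of the commit above: the "bert_finetune" strategy added in paddle_hub/finetune/optimization.py derives warmup_steps from num_epoch, the number of training examples, batch_size and warmup_proportion, then feeds the loss through an Adam optimizer whose learning rate follows a warmup phase and a decay phase (the linear_warmup_decay helper referenced in this file ramps the rate up linearly and then decays it linearly to zero via polynomial_decay with power=1.0). The snippet below is a minimal, self-contained sketch of that schedule shape only; every concrete number is an illustrative placeholder, and the helper name scheduled_lr is hypothetical rather than an API from this patch.

# Sketch (assumption-laden, for illustration only) of the per-step learning
# rate produced by linear warmup followed by linear decay to zero.
def scheduled_lr(step, base_lr, warmup_steps, max_train_steps):
    """Learning rate at `step` under linear warmup then linear decay."""
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * step / warmup_steps          # warmup: ramp 0 -> base_lr
    step = min(step, max_train_steps)
    return base_lr * (1.0 - float(step) / max_train_steps)  # decay: base_lr -> 0

if __name__ == '__main__':
    # placeholder setup mirroring bert_finetune's warmup-step arithmetic
    num_train_examples, batch_size, num_epoch, dev_count = 9600, 32, 3, 1
    max_train_steps = num_epoch * num_train_examples // batch_size // dev_count
    warmup_steps = int(max_train_steps * 0.1)         # warmup_proportion = 0.1
    for s in (0, warmup_steps, max_train_steps // 2, max_train_steps):
        print("step %d -> lr %.2e" % (s, scheduled_lr(s, 5e-5, warmup_steps,
                                                      max_train_steps)))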