diff --git a/demo/bert-cls/finetune_with_hub.py b/demo/bert-cls/finetune_with_hub.py
index d5252499a6e9c4dd19668186ab8743ec9def25d5..34177f46319de717b20473eaa19bf58a4903dafb 100644
--- a/demo/bert-cls/finetune_with_hub.py
+++ b/demo/bert-cls/finetune_with_hub.py
@@ -70,8 +70,21 @@ run_type_g.add_arg("use_cuda", bool, True, "If set, use G
 args = parser.parse_args()
 # yapf: enable.
-
-def test_hub_api(args, config):
+if __name__ == '__main__':
+    print_arguments(args)
+    config = FinetuneConfig(
+        log_interval=10,
+        eval_interval=100,
+        save_ckpt_interval=200,
+        use_cuda=True,
+        checkpoint_dir="./bert_cls_ckpt",
+        learning_rate=args.learning_rate,
+        num_epoch=args.epoch,
+        batch_size=args.batch_size,
+        max_seq_len=args.max_seq_len,
+        weight_decay=args.weight_decay,
+        in_tokens=args.in_tokens,
+        warmup_proportion=args.warmup_proportion)
 
     processor = reader.ChnsenticorpProcessor(
         data_dir=args.data_dir,
@@ -86,38 +99,28 @@ def test_hub_api(args, config):
 
     # loading paddlehub BERT
     module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")
 
+    # BERT's input tensors, output tensors and forward graph.
+    # If you want to fine-tune the pretrained model parameters, set
+    # trainable to True.
     input_dict, output_dict, train_program = module.context(
         sign_name="pooled_output", trainable=True)
 
-    startup_program = fluid.Program()
-    with fluid.program_guard(train_program, startup_program):
+    with fluid.program_guard(train_program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')
 
         pooled_output = output_dict["pooled_output"]
 
-        # setup feed list for data feeder
+        # Set up the feed list for the data feeder.
+        # All input tensors of BERT's module must be fed.
        feed_list = [
             input_dict["src_ids"].name, input_dict["pos_ids"].name,
             input_dict["sent_ids"].name, input_dict["input_mask"].name,
             label.name
         ]
 
+        # Define a classification finetune task with PaddleHub's API.
         cls_task = hub.append_mlp_classifier(
             pooled_output, label, num_classes=num_labels)
 
+        # Finetune and evaluate with PaddleHub's API; it runs training,
+        # evaluation, testing and model saving automatically.
         hub.finetune_and_eval(cls_task, feed_list, processor, config)
-
-
-if __name__ == '__main__':
-    print_arguments(args)
-    config = FinetuneConfig(
-        stat_interval=args.skip_steps,
-        eval_interval=args.validation_steps,
-        use_cuda=True,
-        learning_rate=args.learning_rate,
-        weight_decay=args.weight_decay,
-        in_tokens=args.in_tokens,
-        epoch=args.epoch,
-        batch_size=args.batch_size,
-        max_seq_len=args.max_seq_len,
-        warmup_proportion=args.warmup_proportion)
-    test_hub_api(args, config)
diff --git a/demo/bert-cls/run_fintune_with_hub.sh b/demo/bert-cls/run_fintune_with_hub.sh
index a446c14efda49dffe88cc7751ffc2362be1e87b3..a54d88cab4815ec1fa8da4ac98657b20662c748b 100644
--- a/demo/bert-cls/run_fintune_with_hub.sh
+++ b/demo/bert-cls/run_fintune_with_hub.sh
@@ -7,8 +7,8 @@ DATA_PATH=chnsenticorp_data
 rm -rf $CKPT_PATH
 python -u finetune_with_hub.py \
     --use_cuda true \
-    --batch_size 4096 \
-    --in_tokens true \
+    --batch_size 32 \
+    --in_tokens false \
     --data_dir ${DATA_PATH} \
     --vocab_path ${BERT_BASE_PATH}/vocab.txt \
     --weight_decay 0.01 \
diff --git a/paddle_hub/finetune/config.py b/paddle_hub/finetune/config.py
index f055064e2a7ebc13523da5c4b612fc74cd13a92d..755d1b49fb402b3c84175b6b89149f5491033f46 100644
--- a/paddle_hub/finetune/config.py
+++ b/paddle_hub/finetune/config.py
@@ -14,8 +14,20 @@
 import collections
 
-FinetuneConfig = collections.namedtuple('FinetuneConfig', [
-    'stat_interval', 'eval_interval', 'use_cuda', 'learning_rate',
-    'weight_decay', 'in_tokens', 'epoch', 'batch_size', 'max_seq_len',
-    'warmup_proportion'
-])
+FinetuneConfig = collections.namedtuple(
+    'FinetuneConfig',
+    [
+        'log_interval',  # print a training log every n steps
+        'eval_interval',  # evaluate the model every n steps
+        'save_ckpt_interval',  # save a model checkpoint every n steps
+        'use_cuda',  # use GPU or not
+        'learning_rate',
+        'checkpoint_dir',  # model checkpoint directory
+        'num_epoch',  # number of finetune epochs
+        'batch_size',
+        # parameters for BERT
+        'max_seq_len',  # for BERT
+        'weight_decay',  # for BERT
+        'warmup_proportion',  # for BERT
+        'in_tokens'  # for BERT
+    ])
diff --git a/paddle_hub/finetune/finetune.py b/paddle_hub/finetune/finetune.py
index 50a6a27f6e5d4e8255e6f4e3867b121b8f0fb3bb..db1c99b2513bcd86b3ef082ce9f566977a89d21c 100644
--- a/paddle_hub/finetune/finetune.py
+++ b/paddle_hub/finetune/finetune.py
@@ -30,22 +30,7 @@ def finetune_and_eval(task, feed_list, data_processor, config=None):
     else:
         place = fluid.CPUPlace()
         dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
-    # data generator
-    data_generator = {
-        'train':
-        data_processor.data_generator(
-            batch_size=config.batch_size,
-            phase='train',
-            epoch=config.epoch,
-            shuffle=False),
-        'test':
-        data_processor.data_generator(
-            batch_size=config.batch_size, phase='test', shuffle=False),
-        'dev':
-        data_processor.data_generator(
-            batch_size=config.batch_size, phase='dev', shuffle=False)
-    }
+    exe = fluid.Executor(place)
 
     # hub.finetune_and_eval start here
     #TODO: to simplify
@@ -56,10 +41,10 @@
     num_train_examples = data_processor.get_num_examples(phase='train')
 
     if config.in_tokens:
-        max_train_steps = config.epoch * num_train_examples // (
+        max_train_steps = config.num_epoch * num_train_examples // (
             config.batch_size // config.max_seq_len) // dev_count
     else:
-        max_train_steps = config.epoch * num_train_examples // config.batch_size // dev_count
+        max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
 
     warmup_steps = int(max_train_steps * config.warmup_proportion)
 
@@ -83,73 +68,80 @@
         num_example.name
     ])
 
-    place = fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
     exe.run(startup_program)
 
     feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
 
     # Traning block
     # prepare training dataset
-    train_data_generator = data_generator['train']
     total_loss, total_acc, total_num_example = [], [], []
     step = 0
     time_begin = time.time()
     train_time_used = 0.0
-    for example in train_data_generator():
-        step += 1
-        train_time_begin = time.time()
-        np_loss, np_acc, np_num_example = exe.run(
-            program=train_program,
-            feed=feeder.feed([example]),
-            fetch_list=[loss, accuracy, num_example])
-        train_time_used += time.time() - train_time_begin
+    for epoch in range(1, config.num_epoch + 1):
+        print("Epoch {}".format(epoch))
+        train_data_generator = data_processor.data_generator(
+            batch_size=config.batch_size, phase='train', shuffle=False)
+        for example in train_data_generator():
+            step += 1
+            train_time_begin = time.time()
+            np_loss, np_acc, np_num_example = exe.run(
+                program=train_program,
+                feed=feeder.feed([example]),
+                fetch_list=[loss, accuracy, num_example])
+            train_time_used += time.time() - train_time_begin
+
+            # Statistic Block
+            total_loss.extend(np_loss * np_num_example)
+            total_acc.extend(np_acc * np_num_example)
+            total_num_example.extend(np_num_example)
+            if step % config.log_interval == 0:
+                # get training progress
+                accum_num_example = np.sum(total_num_example)
+                print(
+                    "step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
+                        step,
+                        np.sum(total_loss) / accum_num_example,
+                        np.sum(total_acc) / accum_num_example,
+                        config.log_interval / train_time_used))
+                # reset statistic variables
+                total_loss, total_acc, total_num_example = [], [], []
+                train_time_used = 0.0
+
+            # Evaluation block
+            if step % config.eval_interval == 0:
+                test_data_generator = data_processor.data_generator(
+                    batch_size=config.batch_size, phase='test', shuffle=False)
+                dev_data_generator = data_processor.data_generator(
+                    batch_size=config.batch_size, phase='dev', shuffle=False)
+                evaluate(task, test_program, exe, feeder, dev_data_generator)
+                evaluate(task, test_program, exe, feeder, test_data_generator)
+
+            # Save model checkpoint
+            if step % config.save_ckpt_interval == 0:
+                save_checkpoint(exe, train_program, step, config.checkpoint_dir)
+
+    # Finish with a final evaluation on the test set
+    test_data_generator = data_processor.data_generator(
+        batch_size=config.batch_size, phase='test', shuffle=False)
+    evaluate(task, test_program, exe, feeder, test_data_generator)
+
+
+def save_checkpoint(exe, train_program, step, ckpt_dir):
+    # TODO: add a global step variable for restoring checkpoints, like TensorFlow
+    ckpt_step_dir = os.path.join(ckpt_dir, "step_{}".format(step))
+    fluid.io.save_persistables(exe, ckpt_step_dir, train_program)
+
+
+def evaluate(task, test_program, exe, feeder, data_generator):
+    loss = task.variable("loss")
+    probs = task.variable("probs")
+    accuracy = task.variable("accuracy")
+    num_example = task.variable("num_example")
-
-        # Statistic Block
-        total_loss.extend(np_loss * np_num_example)
-        total_acc.extend(np_acc * np_num_example)
-        total_num_example.extend(np_num_example)
-        if step % config.stat_interval == 0:
-            # get training progress
-            accum_num_example = np.sum(total_num_example)
-            print("step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
-                step,
-                np.sum(total_loss) / accum_num_example,
-                np.sum(total_acc) / accum_num_example,
-                config.stat_interval / train_time_used))
-            # reset statistic variables
-            total_loss, total_acc, total_num_example = [], [], []
-            train_time_used = 0.0
-
-        # Evaluation block
-        if step % config.eval_interval == 0:
-            evaluate(test_program, exe, data_generator)
-
-        if step % config.eval_interval == 0:
-            # Final Test Block
-            total_loss, total_acc, total_num_example = [], [], []
-            test_data_generator = data_generator['test']
-            for example in test_data_generator():
-                np_loss, np_acc, np_num_example = exe.run(
-                    program=test_program,
-                    feed=feeder.feed([example]),
-                    fetch_list=[loss, accuracy, num_example])
-                total_loss.extend(np_loss * np_num_example)
-                total_acc.extend(np_acc * np_num_example)
-                total_num_example.extend(np_num_example)
-            accum_num_example = np.sum(total_num_example)
-            print("[Final Test] loss={:.5f} acc={:.5f}".format(
-                np.sum(total_loss) / accum_num_example,
-                np.sum(total_acc) / accum_num_example))
-
-
-def evaluate(test_program, exe, feeder, data_generator):
-    print("Evaluation start")
     total_loss, total_acc, total_num_example = [], [], []
-    dev_data_generator = data_generator['dev']
-
     eval_step = 0
     eval_time_begin = time.time()
-    for example in dev_data_generator():
+    for example in data_generator():
         eval_step += 1
         np_loss, np_acc, np_num_example = exe.run(
             program=test_program,
             feed=feeder.feed([example]),
@@ -160,6 +152,6 @@ def evaluate(test_program, exe, feeder, data_generator):
         total_num_example.extend(np_num_example)
     eval_time_used = time.time() - eval_time_begin
     accum_num_example = np.sum(total_num_example)
-    print("[Evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
+    print("[evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
         np.sum(total_loss) / accum_num_example,
         np.sum(total_acc) / accum_num_example, eval_step / eval_time_used))
diff --git a/paddle_hub/finetune/optimization.py b/paddle_hub/finetune/optimization.py
index 620e24bad55b991c3f96972566b0216ae6d4138a..6a7cbcd5ac1ba145cd94c98004bd2e7d04934bfb 100644
--- a/paddle_hub/finetune/optimization.py
+++ b/paddle_hub/finetune/optimization.py
@@ -49,95 +49,6 @@ def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
     return lr
 
-
-def optimization(loss,
-                 warmup_steps,
-                 num_train_steps,
-                 learning_rate,
-                 train_program,
-                 startup_prog,
-                 weight_decay,
-                 scheduler='linear_warmup_decay',
-                 use_fp16=False,
-                 loss_scaling=1.0):
-    if warmup_steps > 0:
-        if scheduler == 'noam_decay':
-            scheduled_lr = fluid.layers.learning_rate_scheduler\
-                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
-                            warmup_steps)
-        elif scheduler == 'linear_warmup_decay':
-            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
-                                               num_train_steps)
-        else:
-            raise ValueError("Unkown learning rate scheduler, should be "
-                             "'noam_decay' or 'linear_warmup_decay'")
-        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    else:
-        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
-        scheduled_lr = learning_rate
-
-    clip_norm_thres = 1.0
-    # When using mixed precision training, scale the gradient clip threshold
-    # by loss_scaling
-    if use_fp16 and loss_scaling > 1.0:
-        clip_norm_thres *= loss_scaling
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
-
-    def exclude_from_weight_decay(name):
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    param_list = dict()
-
-    if use_fp16:
-        param_grads = optimizer.backward(loss)
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, loss_scaling)
-
-        for param, _ in master_param_grads:
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        optimizer.apply_gradients(master_param_grads)
-
-        if weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param.name.rstrip(".master")):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
-
-    else:
-        for param in train_program.global_block().all_parameters():
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        _, param_grads = optimizer.minimize(loss)
-
-        if weight_decay > 0:
-            for param, grad in param_grads:
-                if exclude_from_weight_decay(param.name):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-    return scheduled_lr
-
-
 def bert_optimization(loss,
                       warmup_steps,
                       num_train_steps,