From b45479eec9e1de86723bb8f679afdb21e0c035fe Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Fri, 29 Mar 2019 00:31:49 +0800
Subject: [PATCH] add get_vocab_path api for bert module

---
 demo/bert-cls/create_module.py     | 182 -----------------------------
 demo/bert-cls/finetune_with_hub.py |   7 +-
 demo/bert-cls/run_create_module.sh |  29 -----
 paddle_hub/module/module.py        |  11 +-
 4 files changed, 13 insertions(+), 216 deletions(-)
 delete mode 100644 demo/bert-cls/create_module.py
 delete mode 100644 demo/bert-cls/run_create_module.sh

diff --git a/demo/bert-cls/create_module.py b/demo/bert-cls/create_module.py
deleted file mode 100644
index 6afc1353..00000000
--- a/demo/bert-cls/create_module.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Finetuning on classification tasks."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-import argparse
-import numpy as np
-import multiprocessing
-
-import paddle
-import paddle.fluid as fluid
-import paddle_hub as hub
-
-import reader.cls as reader
-from model.bert import BertConfig
-from model.classifier import create_bert_module
-from optimization import optimization
-from utils.args import ArgumentGroup, print_arguments
-from utils.init import init_pretraining_params, init_checkpoint
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
-model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
-model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
-model_g.add_arg("init_pretraining_params", str, None,
-                "Init pre-training params which preforms fine-tuning from. If the "
-                "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
-model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
-
-train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
-train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
-train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
-                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
-train_g.add_arg("warmup_proportion", float, 0.1,
-                "Proportion of training steps to perform linear learning rate warmup for.")
-train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
-train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
-train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
-train_g.add_arg("loss_scaling", float, 1.0,
-                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
-
-log_g = ArgumentGroup(parser, "logging", "logging related.")
-log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
-log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
-
-data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
-data_g.add_arg("data_dir", str, None, "Path to training data.")
-data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
-data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
-data_g.add_arg("in_tokens", bool, False,
-               "If set, the batch size will be the maximum number of tokens in one batch. "
-               "Otherwise, it will be the maximum number of examples in one batch.")
-data_g.add_arg("do_lower_case", bool, True,
-               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
-data_g.add_arg("random_seed", int, 0, "Random seed.")
-
-run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
-run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
-run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
-run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
-run_type_g.add_arg("task_name", str, None,
-                   "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
-run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
-run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
-run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
-
-args = parser.parse_args()
-# yapf: enable.
-
-
-def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
-    test_pyreader.start()
-    total_cost, total_acc, total_num_seqs = [], [], []
-    time_begin = time.time()
-    while True:
-        try:
-            np_loss, np_acc, np_num_seqs = exe.run(
-                program=test_program, fetch_list=fetch_list)
-            total_cost.extend(np_loss * np_num_seqs)
-            total_acc.extend(np_acc * np_num_seqs)
-            total_num_seqs.extend(np_num_seqs)
-        except fluid.core.EOFException:
-            test_pyreader.reset()
-            break
-    time_end = time.time()
-    print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
-          (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
-           np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))
-
-
-def main(args):
-    bert_config = BertConfig(args.bert_config_path)
-    bert_config.print_config()
-
-    if args.use_cuda:
-        place = fluid.CUDAPlace(0)
-        dev_count = fluid.core.get_cuda_device_count()
-    else:
-        place = fluid.CPUPlace()
-        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
-    task_name = args.task_name.lower()
-    processors = {
-        'xnli': reader.XnliProcessor,
-        'cola': reader.ColaProcessor,
-        'mrpc': reader.MrpcProcessor,
-        'mnli': reader.MnliProcessor,
-        'chnsenticorp': reader.ChnsenticorpProcessor
-    }
-
-    processor = processors[task_name](
-        data_dir=args.data_dir,
-        vocab_path=args.vocab_path,
-        max_seq_len=args.max_seq_len,
-        do_lower_case=args.do_lower_case,
-        in_tokens=args.in_tokens,
-        random_seed=args.random_seed)
-    num_labels = len(processor.get_labels())
-
-    startup_prog = fluid.Program()
-    train_program = fluid.Program()
-    with fluid.program_guard(train_program, startup_prog):
-        with fluid.unique_name.guard():
-            src_ids, pos_ids, sent_ids, input_mask, pooled_output, sequence_output = create_bert_module(
-                args,
-                pyreader_name='train_reader',
-                bert_config=bert_config,
-                num_labels=num_labels)
-
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    init_pretraining_params(
-        exe,
-        args.init_pretraining_params,
-        main_program=startup_prog,
-        use_fp16=args.use_fp16)
-
-    pooled_output_sign = hub.create_signature(
-        "pooled_output",
-        inputs=[src_ids, pos_ids, sent_ids, input_mask],
-        outputs=[pooled_output],
-        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
-        fetch_names=["pooled_output"])
-
-    sequence_output_sign = hub.create_signature(
-        "sequence_output",
-        inputs=[src_ids, pos_ids, sent_ids, input_mask],
-        outputs=[sequence_output],
-        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
-        fetch_names=["sequence_output"])
-
-    hub.create_module(
-        sign_arr=[pooled_output_sign, sequence_output_sign],
-        module_dir="./chinese_L-12_H-768_A-12.hub_module",
-        exe=exe,
-        assets=[])
-
-
-if __name__ == '__main__':
-    print_arguments(args)
-    main(args)
diff --git a/demo/bert-cls/finetune_with_hub.py b/demo/bert-cls/finetune_with_hub.py
index 78576131..aa9541c2 100644
--- a/demo/bert-cls/finetune_with_hub.py
+++ b/demo/bert-cls/finetune_with_hub.py
@@ -89,9 +89,13 @@ if __name__ == '__main__':
         optimizer=None,
         warmup_proportion=args.warmup_proportion)

+    module = hub.Module(
+        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
+
+    print("vocab_path = {}".format(module.get_vocab_path()))
     processor = reader.ChnsenticorpProcessor(
         data_dir=args.data_dir,
-        vocab_path=args.vocab_path,
+        vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len,
         do_lower_case=args.do_lower_case,
         in_tokens=args.in_tokens,
@@ -100,7 +104,6 @@ if __name__ == '__main__':
     num_labels = len(processor.get_labels())

     # loading paddlehub BERT
-    module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")

     # bert's input tensor, output tensor and forward graph
     # If you want to fine-tune the pretrain model parameter, please set
diff --git a/demo/bert-cls/run_create_module.sh b/demo/bert-cls/run_create_module.sh
deleted file mode 100644
index 506610e7..00000000
--- a/demo/bert-cls/run_create_module.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-export FLAGS_enable_parallel_graph=1
-export FLAGS_sync_nccl_allreduce=1
-export CUDA_VISIBLE_DEVICES=0
-
-BERT_BASE_PATH="chinese_L-12_H-768_A-12"
-TASK_NAME='chnsenticorp'
-DATA_PATH=chnsenticorp_data
-CKPT_PATH=chn_checkpoints
-
-python -u create_module.py --task_name ${TASK_NAME} \
-                   --use_cuda true \
-                   --do_train true \
-                   --do_val true \
-                   --do_test true \
-                   --batch_size 4096 \
-                   --in_tokens true \
-                   --init_pretraining_params ${BERT_BASE_PATH}/params \
-                   --data_dir ${DATA_PATH} \
-                   --vocab_path ${BERT_BASE_PATH}/vocab.txt \
-                   --checkpoints ${CKPT_PATH} \
-                   --save_steps 100 \
-                   --weight_decay 0.01 \
-                   --warmup_proportion 0.0 \
-                   --validation_steps 50 \
-                   --epoch 3 \
-                   --max_seq_len 128 \
-                   --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
-                   --learning_rate 5e-5 \
-                   --skip_steps 10
diff --git a/paddle_hub/module/module.py b/paddle_hub/module/module.py
index 200387ba..d838d7bb 100644
--- a/paddle_hub/module/module.py
+++ b/paddle_hub/module/module.py
@@ -98,7 +98,6 @@ class Module:
         self.default_signature = None
         self.module_info = None
         self.processor = None
-        self.assets = []
         self.name = "temp"
         if url:
             self._init_with_url(url=url)
@@ -111,8 +110,8 @@ class Module:
         ), "processor should be sub class of hub.BaseProcessor"
         if assets:
             self.assets = utils.to_list(assets)
-            for asset in assets:
-                utils.check_path(assets)
+            # for asset in assets:
+            #     utils.check_path(assets)
         self.processor = processor
         self._generate_module_info(module_info)
         self._init_with_signature(signatures=signatures)
@@ -254,6 +253,12 @@ class Module:
             self.__dict__[sign] = functools.partial(
                 self.__call__, sign_name=sign)

+    def get_vocab_path(self):
+        for assets_file in self.assets:
+            print(assets_file)
+            if "vocab.txt" in assets_file:
+                return assets_file
+
     def _recover_from_desc(self):
         # recover signature
         for sign, module_var in self.desc.sign2var.items():
-- 
GitLab
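
Usage note: the sketch below shows how the new get_vocab_path() API is meant to be used by the demo after this patch. It mirrors the updated demo/bert-cls/finetune_with_hub.py: the exported BERT module is loaded first, and the vocabulary file is read from the module's packaged assets instead of being passed in via a separate --vocab_path argument. The module directory, data directory, and processor arguments below are illustrative values taken from the demo defaults, not values fixed by this patch.

    # Sketch only: assumes a BERT module has already been exported to the
    # directory below and that the demo-local "reader" package is importable.
    import paddle_hub as hub
    import reader.cls as reader

    module = hub.Module(
        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")

    # get_vocab_path() scans the module's packaged assets and returns the
    # first file whose path contains "vocab.txt" (None if no vocabulary
    # asset was packaged with the module).
    vocab_path = module.get_vocab_path()

    processor = reader.ChnsenticorpProcessor(
        data_dir="./chnsenticorp_data",  # illustrative data location
        vocab_path=vocab_path,           # no separate --vocab_path needed
        max_seq_len=128,
        do_lower_case=True,
        in_tokens=True,
        random_seed=0)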