Commit b45479ee authored by Zeyu Chen

add get_vocab_path api for bert module

Parent fc72dc60
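The change lets downstream fine-tuning code look up the vocabulary file packaged with an exported BERT module, instead of wiring a separate --vocab_path flag through every script. A minimal usage sketch, assuming a module has already been exported by the module-creation script below and that vocab.txt was registered as one of its assets (the module_dir and data_dir values here are illustrative):

import paddle_hub as hub
import reader.cls as reader

# Load the exported BERT module (path taken from the script below; adjust as needed).
module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")

# get_vocab_path() returns the path of the vocab.txt asset packaged inside the module,
# so the data processor no longer needs an external --vocab_path argument.
processor = reader.ChnsenticorpProcessor(
    data_dir="./chnsenticorp_data",        # illustrative data location
    vocab_path=module.get_vocab_path(),
    max_seq_len=128,
    do_lower_case=True,
    in_tokens=False,
    random_seed=0)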
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
import paddle_hub as hub
import reader.cls as reader
from model.bert import BertConfig
from model.classifier import create_bert_module
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epochs for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
                "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")

log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")

data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. See also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
               "If set, the batch size will be the maximum number of tokens in one batch. "
               "Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (experimental).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "The iteration intervals to clean up temporary variables.")
run_type_g.add_arg("task_name", str, None,
                   "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc', 'chnsenticorp'}.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")

args = parser.parse_args()
# yapf: enable.


def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs = [], [], []
    time_begin = time.time()
    while True:
        try:
            np_loss, np_acc, np_num_seqs = exe.run(
                program=test_program, fetch_list=fetch_list)
            total_cost.extend(np_loss * np_num_seqs)
            total_acc.extend(np_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)
        except fluid.core.EOFException:
            test_pyreader.reset()
            break
    time_end = time.time()
    print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
          (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
           np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))


def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
        'chnsenticorp': reader.ChnsenticorpProcessor
    }

    processor = processors[task_name](
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    startup_prog = fluid.Program()
    train_program = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            src_ids, pos_ids, sent_ids, input_mask, pooled_output, sequence_output = create_bert_module(
                args,
                pyreader_name='train_reader',
                bert_config=bert_config,
                num_labels=num_labels)

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_pretraining_params(
        exe,
        args.init_pretraining_params,
        main_program=startup_prog,
        use_fp16=args.use_fp16)

    pooled_output_sign = hub.create_signature(
        "pooled_output",
        inputs=[src_ids, pos_ids, sent_ids, input_mask],
        outputs=[pooled_output],
        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
        fetch_names=["pooled_output"])
    sequence_output_sign = hub.create_signature(
        "sequence_output",
        inputs=[src_ids, pos_ids, sent_ids, input_mask],
        outputs=[sequence_output],
        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
        fetch_names=["sequence_output"])
    hub.create_module(
        sign_arr=[pooled_output_sign, sequence_output_sign],
        module_dir="./chinese_L-12_H-768_A-12.hub_module",
        exe=exe,
        assets=[])


if __name__ == '__main__':
    print_arguments(args)
    main(args)

@@ -89,9 +89,13 @@ if __name__ == '__main__':
         optimizer=None,
         warmup_proportion=args.warmup_proportion)
 
+    module = hub.Module(
+        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
+    print("vocab_path = {}".format(module.get_vocab_path()))
+
     processor = reader.ChnsenticorpProcessor(
         data_dir=args.data_dir,
-        vocab_path=args.vocab_path,
+        vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len,
         do_lower_case=args.do_lower_case,
         in_tokens=args.in_tokens,

@@ -100,7 +104,6 @@ if __name__ == '__main__':
     num_labels = len(processor.get_labels())
 
     # loading paddlehub BERT
-    module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")
 
     # bert's input tensor, output tensor and forward graph
     # If you want to fine-tune the pretrain model parameter, please set
......
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=0
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='chnsenticorp'
DATA_PATH=chnsenticorp_data
CKPT_PATH=chn_checkpoints
python -u create_module.py --task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_val true \
--do_test true \
--batch_size 4096 \
--in_tokens true \
--init_pretraining_params ${BERT_BASE_PATH}/params \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 100 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--validation_steps 50 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10

@@ -98,7 +98,6 @@ class Module:
         self.default_signature = None
        self.module_info = None
        self.processor = None
        self.assets = []
        self.name = "temp"
        if url:
            self._init_with_url(url=url)

@@ -111,8 +110,8 @@ class Module:
             ), "processor should be sub class of hub.BaseProcessor"
             if assets:
                 self.assets = utils.to_list(assets)
-                for asset in assets:
-                    utils.check_path(assets)
+                # for asset in assets:
+                #     utils.check_path(assets)
             self.processor = processor
         self._generate_module_info(module_info)
         self._init_with_signature(signatures=signatures)

@@ -254,6 +253,12 @@ class Module:
             self.__dict__[sign] = functools.partial(
                 self.__call__, sign_name=sign)
 
+    def get_vocab_path(self):
+        for assets_file in self.assets:
+            print(assets_file)
+            if "vocab.txt" in assets_file:
+                return assets_file
+
     def _recover_from_desc(self):
         # recover signature
         for sign, module_var in self.desc.sign2var.items():
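Note on the lookup: get_vocab_path() simply scans self.assets for a path containing "vocab.txt", so the vocabulary file has to be packaged as an asset when the module is exported. A minimal sketch of that producer side, assuming hub.create_module accepts a list of asset file paths as the Module code above suggests (the create_module.py script above currently passes assets=[], in which case get_vocab_path() would find nothing and return None):

# Hypothetical variant of the hub.create_module call from the script above,
# registering the vocabulary file so that Module.get_vocab_path() can locate it.
hub.create_module(
    sign_arr=[pooled_output_sign, sequence_output_sign],
    module_dir="./chinese_L-12_H-768_A-12.hub_module",
    exe=exe,
    assets=[args.vocab_path])  # e.g. chinese_L-12_H-768_A-12/vocab.txt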