Commit b45479ee authored by Zeyu Chen

add get_vocab_path api for bert module

Parent fc72dc60
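The change lets downstream fine-tuning code look up the vocabulary file packaged with an exported BERT module, instead of wiring a separate --vocab_path flag through every script. A minimal usage sketch, assuming a module has already been exported by the module-creation script below and that vocab.txt was registered as one of its assets (the module_dir and data_dir values here are illustrative):

import paddle_hub as hub
import reader.cls as reader

# Load the exported BERT module (path taken from the script below; adjust as needed).
module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")

# get_vocab_path() returns the path of the vocab.txt asset packaged inside the module,
# so the data processor no longer needs an external --vocab_path argument.
processor = reader.ChnsenticorpProcessor(
    data_dir="./chnsenticorp_data",        # illustrative data location
    vocab_path=module.get_vocab_path(),
    max_seq_len=128,
    do_lower_case=True,
    in_tokens=False,
    random_seed=0)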
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
import paddle_hub as hub
import reader.cls as reader
from model.bert import BertConfig
from model.classifier import create_bert_module
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epochs for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
                "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")

log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")

data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. See also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
               "If set, the batch size will be the maximum number of tokens in one batch. "
               "Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (experimental).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "The iteration intervals to clean up temporary variables.")
run_type_g.add_arg("task_name", str, None,
                   "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc', 'chnsenticorp'}.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")

args = parser.parse_args()
# yapf: enable.


def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs = [], [], []
    time_begin = time.time()
    while True:
        try:
            np_loss, np_acc, np_num_seqs = exe.run(
                program=test_program, fetch_list=fetch_list)
            total_cost.extend(np_loss * np_num_seqs)
            total_acc.extend(np_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)
        except fluid.core.EOFException:
            test_pyreader.reset()
            break
    time_end = time.time()
    print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
          (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
           np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))


def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
        'chnsenticorp': reader.ChnsenticorpProcessor
    }

    processor = processors[task_name](
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    startup_prog = fluid.Program()
    train_program = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            src_ids, pos_ids, sent_ids, input_mask, pooled_output, sequence_output = create_bert_module(
                args,
                pyreader_name='train_reader',
                bert_config=bert_config,
                num_labels=num_labels)

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_pretraining_params(
        exe,
        args.init_pretraining_params,
        main_program=startup_prog,
        use_fp16=args.use_fp16)

    pooled_output_sign = hub.create_signature(
        "pooled_output",
        inputs=[src_ids, pos_ids, sent_ids, input_mask],
        outputs=[pooled_output],
        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
        fetch_names=["pooled_output"])
    sequence_output_sign = hub.create_signature(
        "sequence_output",
        inputs=[src_ids, pos_ids, sent_ids, input_mask],
        outputs=[sequence_output],
        feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
        fetch_names=["sequence_output"])
    hub.create_module(
        sign_arr=[pooled_output_sign, sequence_output_sign],
        module_dir="./chinese_L-12_H-768_A-12.hub_module",
        exe=exe,
        assets=[])


if __name__ == '__main__':
    print_arguments(args)
    main(args)

@@ -89,9 +89,13 @@ if __name__ == '__main__':
         optimizer=None,
         warmup_proportion=args.warmup_proportion)
 
+    module = hub.Module(
+        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
+    print("vocab_path = {}".format(module.get_vocab_path()))
+
     processor = reader.ChnsenticorpProcessor(
         data_dir=args.data_dir,
-        vocab_path=args.vocab_path,
+        vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len,
         do_lower_case=args.do_lower_case,
         in_tokens=args.in_tokens,

@@ -100,7 +104,6 @@ if __name__ == '__main__':
     num_labels = len(processor.get_labels())
 
     # loading paddlehub BERT
-    module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")
 
     # bert's input tensor, output tensor and forward graph
     # If you want to fine-tune the pretrain model parameter, please set
......
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=0
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='chnsenticorp'
DATA_PATH=chnsenticorp_data
CKPT_PATH=chn_checkpoints
python -u create_module.py --task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_val true \
--do_test true \
--batch_size 4096 \
--in_tokens true \
--init_pretraining_params ${BERT_BASE_PATH}/params \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 100 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--validation_steps 50 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10

@@ -98,7 +98,6 @@ class Module:
         self.default_signature = None
        self.module_info = None
        self.processor = None
        self.assets = []
        self.name = "temp"
        if url:
            self._init_with_url(url=url)

@@ -111,8 +110,8 @@ class Module:
             ), "processor should be sub class of hub.BaseProcessor"
             if assets:
                 self.assets = utils.to_list(assets)
-                for asset in assets:
-                    utils.check_path(assets)
+                # for asset in assets:
+                #     utils.check_path(assets)
             self.processor = processor
         self._generate_module_info(module_info)
         self._init_with_signature(signatures=signatures)

@@ -254,6 +253,12 @@ class Module:
             self.__dict__[sign] = functools.partial(
                 self.__call__, sign_name=sign)
 
+    def get_vocab_path(self):
+        for assets_file in self.assets:
+            print(assets_file)
+            if "vocab.txt" in assets_file:
+                return assets_file
+
     def _recover_from_desc(self):
         # recover signature
         for sign, module_var in self.desc.sign2var.items():
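Note on the lookup: get_vocab_path() simply scans self.assets for a path containing "vocab.txt", so the vocabulary file has to be packaged as an asset when the module is exported. A minimal sketch of that producer side, assuming hub.create_module accepts a list of asset file paths as the Module code above suggests (the create_module.py script above currently passes assets=[], in which case get_vocab_path() would find nothing and return None):

# Hypothetical variant of the hub.create_module call from the script above,
# registering the vocabulary file so that Module.get_vocab_path() can locate it.
hub.create_module(
    sign_arr=[pooled_output_sign, sequence_output_sign],
    module_dir="./chinese_L-12_H-768_A-12.hub_module",
    exe=exe,
    assets=[args.vocab_path])  # e.g. chinese_L-12_H-768_A-12/vocab.txt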