Commit 6d13d7a4 authored by: Zeyu Chen

add max seq len tensor shape update for bert/ernie

Parent 3831c8ba
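Note: the core of this commit is the block in module.py that pins the static sequence dimension of the BERT/ERNIE input tensors to max_seq_len. Below is a minimal sketch, not part of the commit, of how that in-line block could be folded into the set_max_seq_len placeholder added here; the helper signature is an assumption, and it reuses the same paddle.fluid 1.x Variable.desc.set_shape call that the diff itself relies on.

MAX_SEQ_LENGTH = 512  # upper bound supported by the BERT/ERNIE pretrained models

def set_max_seq_len(program, feed_dict, max_seq_len):
    """Pin the static sequence dimension of the BERT/ERNIE input tensors."""
    if max_seq_len <= 0 or max_seq_len > MAX_SEQ_LENGTH:
        raise ValueError(
            "max_seq_len({}) should be in the range of [1, {}]".format(
                max_seq_len, MAX_SEQ_LENGTH))
    for tensor_name in ["input_ids", "position_ids", "segment_ids", "input_mask"]:
        # [-1, max_seq_len, 1]: keep the batch dimension dynamic and only
        # fix the sequence dimension to the requested length
        seq_tensor_shape = [-1, max_seq_len, 1]
        program.global_block().var(
            feed_dict[tensor_name].name).desc.set_shape(seq_tensor_shape)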
......@@ -27,6 +27,7 @@ import paddle.fluid as fluid
import paddle_hub as hub
import reader.cls as reader
import reader.task_reader as task_reader
from utils.args import ArgumentGroup, print_arguments
from paddle_hub.finetune.config import FinetuneConfig
......@@ -36,6 +37,7 @@ parser = argparse.ArgumentParser(__doc__)
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("hub_module_dir", str, None, "PaddleHub module directory")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
......@@ -43,12 +45,10 @@ train_g.add_arg("warmup_proportion", float, 0.1,
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("checkpoint_dir", str, None, "Directory to model checkpoint")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
args = parser.parse_args()
# yapf: enable.
......@@ -60,7 +60,7 @@ if __name__ == '__main__':
eval_interval=100,
save_ckpt_interval=200,
use_cuda=True,
checkpoint_dir="./bert_cls_ckpt",
checkpoint_dir=args.checkpoint_dir,
learning_rate=args.learning_rate,
num_epoch=args.epoch,
batch_size=args.batch_size,
......@@ -72,34 +72,31 @@ if __name__ == '__main__':
optimizer=None,
warmup_proportion=args.warmup_proportion)
# loading paddlehub BERT
# module = hub.Module(
# module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
# Load the PaddleHub BERT/ERNIE module
module = hub.Module(module_dir=args.hub_module_dir)
processor = reader.BERTClassifyReader(
reader = reader.BERTClassifyReader(
data_dir=args.data_dir,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_labels = len(processor.get_labels())
num_labels = len(reader.get_labels())
# BERT's input tensors, output tensors and forward graph
# If you want to fine-tune the pretrained model parameters, please set
# trainable to True
input_dict, output_dict, train_program = module.context(
sign_name="pooled_output", trainable=True)
input_dict, output_dict, program = module.context(
sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
with fluid.program_guard(train_program):
with fluid.program_guard(program):
label = fluid.layers.data(name="label", shape=[1], dtype='int64')
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
pooled_output = output_dict["pooled_output"]
# Set up the feed list for the data feeder
# Must feed all the tensors that the BERT module needs
feed_list = [
input_dict["src_ids"].name, input_dict["pos_ids"].name,
input_dict["sent_ids"].name, input_dict["input_mask"].name,
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
label.name
]
# Define a classification finetune task by PaddleHub's API
......@@ -110,6 +107,6 @@ if __name__ == '__main__':
# will finish training, evaluation, testing and model saving automatically
hub.finetune_and_eval(
task=cls_task,
data_processor=processor,
data_reader=reader,
feed_list=feed_list,
config=config)
......@@ -148,7 +148,8 @@ def pad_batch_data(insts,
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
#max_len = max(len(inst) for inst in insts)
max_len = 50
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
......
......@@ -2,13 +2,18 @@ export CUDA_VISIBLE_DEVICES=5
DATA_PATH=./chnsenticorp_data
rm -rf ./bert_cls_ckpt
#HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
CKPT_DIR="./ckpt"
rm -rf $CKPT_DIR
python -u finetune_with_hub.py \
--batch_size 32 \
--in_tokens false \
--batch_size 64 \
--hub_module_dir=$HUB_MODULE_DIR \
--data_dir ${DATA_PATH} \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--warmup_proportion 0.0 \
--epoch 3 \
--max_seq_len 128 \
--max_seq_len 50 \
--learning_rate 5e-5
......@@ -41,8 +41,7 @@ def _get_running_device_info(config):
return place, dev_count
def _finetune_model(task, data_processor, feed_list, config=None,
do_eval=False):
def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
......@@ -52,10 +51,9 @@ def _finetune_model(task, data_processor, feed_list, config=None,
batch_size = config.batch_size
learning_rate = config.learning_rate
with_memory_optimization = config.with_memory_optimization
checkpoint_dir = config.checkpoint_dir
checkpoint_path = os.path.join(checkpoint_dir, CKPT_FILE)
checkpoint_path = os.path.join(config.checkpoint_dir, CKPT_FILE)
log_writter = LogWriter(
os.path.join(checkpoint_dir, "vdllog"), sync_cycle=10)
os.path.join(config.checkpoint_dir, "vdllog"), sync_cycle=10)
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
......@@ -64,7 +62,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
if config.finetune_strategy == "bert_finetune":
scheduled_lr = bert_finetune(task, main_program, data_processor,
scheduled_lr = bert_finetune(task, main_program, data_reader,
config, dev_count)
elif config.optimizer == "adam":
optimizer = fluid.optimizer.Adam(learning_rate=config.learning_rate)
......@@ -112,7 +110,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")
for epoch in range(last_epoch, num_epoch + 1):
train_reader = data_processor.data_generator(
train_reader = data_reader.data_generator(
batch_size=batch_size, phase='train')
num_trained_examples = acc_sum = loss_sum = 0
for batch in train_reader():
......@@ -144,7 +142,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
if global_step % config.save_ckpt_interval == 0:
model_saved_dir = os.path.join(
checkpoint_dir, "model_in_step_%d" % global_step)
config.checkpoint_dir, "model_in_step_%d" % global_step)
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: the current checkpoint-saving mechanism is not complete;
# it can't restore the dataset training status
......@@ -157,7 +155,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
if do_eval and global_step % config.eval_interval == 0:
eval_loss, eval_acc, eval_perf = evaluate(
task,
data_processor,
data_reader,
feed_list,
phase="val",
config=config)
......@@ -165,7 +163,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
eval_acc_scalar.add_record(global_step, eval_acc)
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
model_saved_dir = os.path.join(checkpoint_dir,
model_saved_dir = os.path.join(config.checkpoint_dir,
"best_model")
logger.info(
"best model saved to %s [best accuracy=%.5f]" %
......@@ -173,7 +171,7 @@ def _finetune_model(task, data_processor, feed_list, config=None,
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# update model and checkpoint
model_saved_dir = os.path.join(checkpoint_dir, "final_model")
model_saved_dir = os.path.join(config.checkpoint_dir, "final_model")
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: the current checkpoint-saving mechanism is not complete; it can't
# restore the dataset training status
......@@ -184,20 +182,19 @@ def _finetune_model(task, data_processor, feed_list, config=None,
last_model_dir=model_saved_dir)
if do_eval:
evaluate(
task, data_processor, feed_list, phase="test", config=config)
evaluate(task, data_reader, feed_list, phase="test", config=config)
logger.info("PaddleHub finetune finished.")
def finetune_and_eval(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, do_eval=True)
def finetune_and_eval(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=True)
def finetune(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, do_eval=False)
def finetune(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=False)
def evaluate(task, data_processor, feed_list, phase="test", config=None):
def evaluate(task, data_reader, feed_list, phase="test", config=None):
inference_program = task.inference_program()
main_program = task.main_program()
loss = task.variable("loss")
......@@ -208,7 +205,7 @@ def evaluate(task, data_processor, feed_list, phase="test", config=None):
with fluid.program_guard(inference_program):
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
num_eval_examples = acc_sum = loss_sum = 0
test_reader = data_processor.data_generator(
test_reader = data_reader.data_generator(
batch_size=batch_size, phase=phase)
eval_time_begin = time.time()
eval_step = 0
......
......@@ -37,6 +37,11 @@ import paddle.fluid as fluid
__all__ = ['Module', 'create_module']
def set_max_seq_len(program, input_dict):
""" Set """
pass
def create_module(sign_arr,
module_dir,
processor=None,
......@@ -62,7 +67,7 @@ PROCESSOR_NAME = "processor"
HUB_VAR_PREFIX = "@HUB_%s@"
class ModuleHelper:
class ModuleHelper(object):
def __init__(self, module_dir):
self.module_dir = module_dir
......@@ -82,7 +87,7 @@ class ModuleHelper:
return os.path.join(self.module_dir, ASSETS_DIRNAME)
class Module:
class Module(object):
def __init__(self,
url=None,
module_dir=None,
......@@ -116,7 +121,7 @@ class Module:
self._generate_module_info(module_info)
self._init_with_signature(signatures=signatures)
else:
raise "Error! HubModule Can't init with nothing"
raise "Error! HubModule can't init with nothing"
def _init_with_url(self, url):
utils.check_url(url)
......@@ -405,7 +410,13 @@ class Module:
for_test=False,
trainable=False,
regularizer=None,
max_seq_len=128,
learning_rate=1e-3):
"""
Args:
max_seq_len(int): maximum sequence length; this option is only
available for BERT/ERNIE modules
"""
assert sign_name in self.signatures, "module did not have a signature with name %s" % sign_name
signature = self.signatures[sign_name]
......@@ -444,11 +455,32 @@ class Module:
if key:
fetch_dict[key] = program.global_block().var(var.name)
# TODO(ZeyuChen) encapsulate into a function
# update BERT/ERNIE's input tensor's sequence length to max_seq_len
if self.name.startswith("bert") or self.name.startswith("ernie"):
print("module_name", self.name)
MAX_SEQ_LENGTH = 512
if max_seq_len > MAX_SEQ_LENGTH or max_seq_len <= 0:
raise ValueError(
"max_seq_len({}) should be in the range of [1, {}]".format(
MAX_SEQ_LENGTH))
logger.info(
"update maximum sequence length of input tensor to {}".format(
max_seq_len))
for tensor_name in [
"input_ids", "position_ids", "segment_ids", "input_mask"
]:
seq_tensor_shape = [-1, max_seq_len, 1]
logger.info("The shape of input tensor[{}] set to {}".format(
tensor_name, seq_tensor_shape))
program.global_block().var(
feed_dict[tensor_name].name).desc.set_shape(
seq_tensor_shape)
# record the number of parameters loaded by PaddleHub
num_param_loaded = 0
for param in program.global_block().iter_parameters():
num_param_loaded += 1
# logger.debug("%s %s" % (param.name, param.optimize_attr))
logger.info(
"%d pretrained paramaters loaded by PaddleHub" % num_param_loaded)
......
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Store PaddleHub version string """
hub_version = "0.2.1.alpha"
hub_version = "0.3.0.alpha"
module_proto_version = "0.1.0"