Commit 9e2c2b34 authored by Zeyu Chen

remove useless code of ernie finetuning

Parent e0de0d86
@@ -36,7 +36,6 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest sequence.")
 parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
 args = parser.parse_args()
 # yapf: enable.
@@ -55,9 +54,7 @@ if __name__ == '__main__':
     # loading PaddleHub BERT
     module = hub.Module(module_dir=args.hub_module_dir)
-    # Use BERTTokenizeReader to tokenize the dataset according to model's
-    # vocabulary
-    reader = hub.reader.BERTTokenizeReader(
+    reader = hub.reader.ClassifyReader(
         dataset=hub.dataset.ChnSentiCorp(),  # download chnsenticorp dataset
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
......
-export CUDA_VISIBLE_DEVICES=5
-DATA_PATH=./chnsenticorp_data
-HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
-#HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
+export CUDA_VISIBLE_DEVICES=3
 CKPT_DIR="./ckpt"
-#rm -rf $CKPT_DIR
 python -u finetune_with_hub.py \
     --batch_size 32 \
-    --hub_module_dir=$HUB_MODULE_DIR \
-    --data_dir ${DATA_PATH} \
     --weight_decay 0.01 \
     --checkpoint_dir $CKPT_DIR \
     --num_epoch 3 \
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
strategy = hub.BERTFinetuneStrategy(
weight_decay=args.weight_decay,
learning_rate=args.learning_rate,
warmup_strategy="linear_warmup_decay",
)
config = hub.RunConfig(
eval_interval=100,
use_cuda=True,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
strategy=strategy)
# loading PaddleHub ERNIE
module = hub.Module(name="ernie")
reader = hub.reader.SequenceLabelReader(
dataset=hub.dataset.MSRA_NER(),
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_labels = len(reader.get_labels())
input_dict, output_dict, program = module.context(
sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
with fluid.program_guard(program):
label = fluid.layers.data(
name="label", shape=[args.max_seq_len, 1], dtype='int64')
seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64')
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
sequence_output = output_dict["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensors that the ERNIE module needs
feed_list = [
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
label.name, seq_len.name
]
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.append_sequence_labeler(
feature=sequence_output,
labels=label,
seq_len=seq_len,
num_classes=num_labels)
# Finetune and evaluate by PaddleHub's API;
# training, evaluation, testing and model saving are handled automatically
hub.finetune_and_eval(
task=seq_label_task,
data_reader=reader,
feed_list=feed_list,
config=config)
export CUDA_VISIBLE_DEVICES=3
CKPT_DIR="./ckpt"
python -u finetune_with_hub.py \
--batch_size 16 \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--num_epoch 3 \
--max_seq_len 256 \
--learning_rate 5e-5
@@ -33,6 +33,7 @@ from .module.manager import default_module_manager
 from .io.type import DataType
 from .finetune.network import append_mlp_classifier
+from .finetune.network import append_sequence_labeler
 from .finetune.finetune import finetune_and_eval
 from .finetune.config import RunConfig
 from .finetune.task import Task
......
@@ -30,6 +30,8 @@ class MSRA_NER(object):
         self._load_label_map()
         self._load_train_examples()
+        self._load_test_examples()
+        self._load_dev_examples()

     def _load_label_map(self):
         self.label_map_file = os.path.join(self.dataset_dir, "label_map.json")
@@ -40,12 +42,28 @@ class MSRA_NER(object):
         train_file = os.path.join(self.dataset_dir, "train.tsv")
         self.train_examples = self._read_tsv(train_file)

+    def _load_dev_examples(self):
+        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
+        self.dev_examples = self._read_tsv(self.dev_file)
+
+    def _load_test_examples(self):
+        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
+        self.test_examples = self._read_tsv(self.test_file)
+
     def get_train_examples(self):
         return self.train_examples

+    def get_dev_examples(self):
+        return self.dev_examples
+
+    def get_test_examples(self):
+        return self.test_examples
+
     def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
+        return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
+
+    def get_label_map(self):
+        return self.label_map

     def _read_tsv(self, input_file, quotechar=None):
         """Reads a tab separated value file."""
......
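For readers new to this tag set, here is a hypothetical illustration (not part of the commit) of how the BIO labels returned by `get_labels()` annotate a sentence:

```python
# Hypothetical MSRA_NER-style example: one tag per character,
# B-/I- marking entity starts/continuations and "O" marking non-entities.
tokens = ["张", "三", "在", "北", "京"]
tags = ["B-PER", "I-PER", "O", "B-LOC", "I-LOC"]  # 张三 = person, 北京 = location
```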
@@ -22,6 +22,7 @@ import multiprocessing
 import paddle
 import paddle.fluid as fluid
+import numpy as np

 from paddlehub.common.logger import logger
 from paddlehub.finetune.strategy import BERTFinetuneStrategy, DefaultStrategy
@@ -61,7 +62,149 @@ def _do_memory_optimization(task, config):
             (lower_mem, upper_mem, unit)),

-def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
+def _finetune_seq_label_task(task,
+                             data_reader,
+                             feed_list,
+                             config=None,
+                             do_eval=False):
"""
Finetune sequence labeling task, evaluate metric is F1, precision and recall
"""
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
seq_len = task.variable("seq_len")
num_epoch = config.num_epoch
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
# Select strategy
if isinstance(config.strategy, BERTFinetuneStrategy):
scheduled_lr = config.strategy.execute(loss, main_program,
data_reader, config)
elif isinstance(config.strategy, DefaultStrategy):
config.strategy.execute(loss)
#TODO: add more finetune strategy
_do_memory_optimization(task, config)
# Try to restore model training checkpoint
current_epoch, global_step = load_checkpoint(config.checkpoint_dir, exe)
train_time_used = 0
logger.info("PaddleHub finetune start")
# Finetune loop
for epoch in range(current_epoch, num_epoch + 1):
train_reader = data_reader.data_generator(
batch_size=batch_size, phase='train')
num_trained_examples = loss_sum = 0
for batch in train_reader():
num_batch_examples = len(batch)
train_time_begin = time.time()
loss_v = exe.run(
feed=data_feeder.feed(batch), fetch_list=[loss.name])
train_time_used += time.time() - train_time_begin
global_step += 1
num_trained_examples += num_batch_examples
loss_sum += loss_v[0] * num_batch_examples
# log finetune status
if global_step % config.log_interval == 0:
avg_loss = loss_sum / num_trained_examples
speed = config.log_interval / train_time_used
logger.info("step %d: loss=%.5f [step/sec: %.2f]" %
(global_step, avg_loss, speed))
train_time_used = 0
num_trained_examples = loss_sum = 0
if config.save_ckpt_interval and global_step % config.save_ckpt_interval == 0:
# NOTE: the current checkpoint saving mechanism is incomplete;
# it can't restore the dataset training status correctly
save_checkpoint(
checkpoint_dir=config.checkpoint_dir,
current_epoch=epoch,
global_step=global_step,
exe=exe)
if do_eval and global_step % config.eval_interval == 0:
evaluate_seq_label(
task,
data_reader,
feed_list,
phase="dev",
config=config)
evaluate_seq_label(
task,
data_reader,
feed_list,
phase="test",
config=config)
# NOTE: the current checkpoint saving mechanism is incomplete; it can't
# restore the dataset training status
save_checkpoint(
checkpoint_dir=config.checkpoint_dir,
current_epoch=num_epoch + 1,
global_step=global_step,
exe=exe)
if do_eval:
evaluate_seq_label(
task, data_reader, feed_list, phase="test", config=config)
logger.info("PaddleHub finetune finished.")
def evaluate_seq_label(task, data_reader, feed_list, phase="test", config=None):
fetch_list = [
task.variable("labels").name,
task.variable("infers").name,
task.variable("seq_len").name,
task.variable("loss").name
]
logger.info("Evaluation on {} dataset start".format(phase))
inference_program = task.inference_program()
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
exe = fluid.Executor(place=place)
with fluid.program_guard(inference_program):
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
num_eval_examples = acc_sum = loss_sum = 0
test_reader = data_reader.data_generator(
batch_size=batch_size, phase=phase)
eval_time_begin = time.time()
eval_step = 0
total_label, total_infer, total_correct = 0.0, 0.0, 0.0
for batch in test_reader():
num_batch_examples = len(batch)
eval_step += 1
np_labels, np_infers, np_lens, _ = exe.run(
feed=data_feeder.feed(batch), fetch_list=fetch_list)
label_num, infer_num, correct_num = chunk_eval(
np_labels, np_infers, np_lens, 7, dev_count)
total_infer += infer_num
total_label += label_num
total_correct += correct_num
precision, recall, f1 = calculate_f1(total_label, total_infer,
total_correct)
eval_time_used = time.time() - eval_time_begin
eval_speed = eval_step / eval_time_used
logger.info(
"[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]"
% (phase, f1, precision, recall, eval_speed))
def _finetune_cls_task(task, data_reader, feed_list, config=None,
                       do_eval=False):
    main_program = task.main_program()
    startup_program = task.startup_program()
    loss = task.variable("loss")
@@ -175,11 +318,15 @@ def evaluate(task, data_reader, feed_list, phase="test", config=None):
 def finetune_and_eval(task, data_reader, feed_list, config=None):
-    _finetune_model(task, data_reader, feed_list, config, do_eval=True)
+    if task.task_type == "sequence_labeling":
+        _finetune_seq_label_task(
+            task, data_reader, feed_list, config, do_eval=True)
+    else:
+        _finetune_cls_task(task, data_reader, feed_list, config, do_eval=True)

 def finetune(task, data_reader, feed_list, config=None):
-    _finetune_model(task, data_reader, feed_list, config, do_eval=False)
+    _finetune_cls_task(task, data_reader, feed_list, config, do_eval=False)
 def evaluate(task, data_reader, feed_list, phase="test", config=None):
@@ -217,3 +364,107 @@ def evaluate(task, data_reader, feed_list, phase="test", config=None):
             (phase, avg_loss, avg_acc, eval_speed))
     return avg_loss, avg_acc, eval_speed
# Sequence label evaluation functions
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
def extract_bio_chunk(seq):
chunks = []
cur_chunk = None
null_index = tag_num - 1
for index in range(len(seq)):
tag = seq[index]
tag_type = tag // 2
tag_pos = tag % 2
if tag == null_index:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = None
continue
if tag_pos == 0:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = {}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
else:
if cur_chunk is None:
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
continue
if cur_chunk["type"] == tag_type:
cur_chunk["en"] = index + 1
else:
chunks.append(cur_chunk)
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
if cur_chunk is not None:
chunks.append(cur_chunk)
return chunks
null_index = tag_num - 1
num_label = 0
num_infer = 0
num_correct = 0
labels = np_labels.reshape([-1]).astype(np.int32).tolist()
infers = np_infers.reshape([-1]).astype(np.int32).tolist()
all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
base_index = 0
for dev_index in range(dev_count):
lens = all_lens[dev_index]
max_len = 0
for l in lens:
max_len = max(max_len, l)
for i in range(len(lens)):
seq_st = base_index + i * max_len + 1
seq_en = seq_st + (lens[i] - 2)
infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
num_infer += len(infer_chunks)
num_label += len(label_chunks)
infer_index = 0
label_index = 0
while label_index < len(label_chunks) \
and infer_index < len(infer_chunks):
if infer_chunks[infer_index]["st"] \
< label_chunks[label_index]["st"]:
infer_index += 1
elif infer_chunks[infer_index]["st"] \
> label_chunks[label_index]["st"]:
label_index += 1
else:
if infer_chunks[infer_index]["en"] \
== label_chunks[label_index]["en"] \
and infer_chunks[infer_index]["type"] \
== label_chunks[label_index]["type"]:
num_correct += 1
infer_index += 1
label_index += 1
base_index += max_len * len(lens)
return num_label, num_infer, num_correct
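To make the chunk matching above concrete, here is a minimal sketch (the inputs are hypothetical) that exercises `chunk_eval` on a single padded sequence, assuming the MSRA_NER label order above with "O" as the last tag id:

```python
import numpy as np

# One sequence of length 5 ([CLS] + 3 tokens + [SEP]) padded to max_len=5;
# tag ids follow B-PER=0, I-PER=1, ..., O=6, so tag_num=7.
np_labels = np.array([6, 0, 1, 6, 6])  # gold: one PER chunk over tokens 1-2
np_infers = np.array([6, 0, 6, 6, 6])  # predicted: PER chunk over token 1 only
np_lens = np.array([5])

num_label, num_infer, num_correct = chunk_eval(
    np_labels, np_infers, np_lens, 7, dev_count=1)
# num_label == 1, num_infer == 1, num_correct == 0: the predicted chunk
# overlaps the gold chunk but its end boundary differs, so it doesn't count.
```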
def calculate_f1(num_label, num_infer, num_correct):
if num_infer == 0:
precision = 0.0
else:
precision = num_correct * 1.0 / num_infer
if num_label == 0:
recall = 0.0
else:
recall = num_correct * 1.0 / num_label
if num_correct == 0:
f1 = 0.0
else:
f1 = 2 * precision * recall / (precision + recall)
return precision, recall, f1
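A quick worked example of the arithmetic (the numbers are made up):

```python
# 10 gold chunks, 8 predicted chunks, 6 exact matches:
precision, recall, f1 = calculate_f1(num_label=10, num_infer=8, num_correct=6)
# precision = 6/8 = 0.75, recall = 6/10 = 0.60,
# f1 = 2 * 0.75 * 0.60 / (0.75 + 0.60) ≈ 0.667
```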
@@ -55,7 +55,6 @@ def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
     accuracy = fluid.layers.accuracy(
         input=probs, label=label, total=num_example)

-    # TODO: encapsulate to Task
     graph_var_dict = {
         "loss": loss,
         "probs": probs,
@@ -77,5 +76,40 @@ def append_mlp_multi_classifier(feature,
     pass

-def append_sequence_labler(feature, label):
-    pass
+def append_sequence_labeler(feature, labels, seq_len, num_classes=None):
+    logits = fluid.layers.fc(
input=feature,
size=num_classes,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_seq_label_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.)))
ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
labels = fluid.layers.flatten(labels, axis=2)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=fluid.layers.flatten(logits, axis=2),
label=labels,
return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
# accuracy = fluid.layers.accuracy(
# input=probs, label=labels, total=num_example)
graph_var_dict = {
"loss": loss,
"probs": probs,
"labels": ret_labels,
"infers": ret_infers,
"seq_len": seq_len
}
task = Task("sequence_labeling", graph_var_dict,
fluid.default_main_program(), fluid.default_startup_program())
return task
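As rough intuition for the reshapes above, the following NumPy sketch (shapes and values are illustrative only, not from the commit) mimics how `append_sequence_labeler` produces the flattened `labels`/`infers` tensors that `evaluate_seq_label` later fetches:

```python
import numpy as np

B, T, C = 2, 3, 7  # assumed batch size, padded length, num_classes
logits = np.random.rand(B, T, C)             # like the [B, T, C] fc output
labels = np.random.randint(0, C, (B, T, 1))  # like the [B, T, 1] label input

ret_infers = logits.argmax(axis=2).reshape(-1, 1)  # argmax + reshape -> [B*T, 1]
ret_labels = labels.reshape(-1, 1)                 # reshape -> [B*T, 1]
# chunk_eval then reads these flat arrays back sequence by sequence,
# guided by the fetched seq_len values.
```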
@@ -13,3 +13,5 @@
 # limitations under the License.
 from .nlp_reader import BERTTokenizeReader
+from .task_reader import ClassifyReader
+from .task_reader import SequenceLabelReader
@@ -20,63 +20,8 @@ from __future__ import print_function
 import numpy as np
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
 def prepare_batch_data(insts,
                        total_token_num,
-                       voc_size=0,
                        max_seq_len=128,
                        pad_id=None,
                        cls_id=None,
@@ -103,17 +48,7 @@ def prepare_batch_data(insts,
         labels = np.array(labels).astype("int64").reshape([-1, 1])
         labels_list.append(labels)

-    # First step: do mask without padding
-    if mask_id >= 0:
-        out, mask_label, mask_pos = mask(
-            batch_src_ids,
-            total_token_num,
-            vocab_size=voc_size,
-            CLS=cls_id,
-            SEP=sep_id,
-            MASK=mask_id)
-    else:
-        out = batch_src_ids
+    out = batch_src_ids
     # Second step: padding
     src_id, self_input_mask = pad_batch_data(
         out, pad_idx=pad_id, max_seq_len=max_seq_len, return_input_mask=True)
@@ -130,12 +65,7 @@ def prepare_batch_data(insts,
         return_pos=False,
         return_input_mask=False)

-    if mask_id >= 0:
-        return_list = [
-            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
-        ] + labels_list
-    else:
-        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+    return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list

     return return_list if len(return_list) > 1 else return_list[0]
@@ -146,7 +76,8 @@ def pad_batch_data(insts,
                    return_pos=False,
                    return_input_mask=False,
                    return_max_len=False,
-                   return_num_token=False):
+                   return_num_token=False,
+                   return_seq_lens=False):
     """
     Pad the instances to the max sequence length in batch, and generate the
     corresponding position data and input mask.
@@ -187,4 +118,8 @@ def pad_batch_data(insts,
             num_token += len(inst)
         return_list += [num_token]

+    if return_seq_lens:
+        seq_lens = np.array([len(inst) for inst in insts])
+        return_list += [seq_lens.astype("int64").reshape([-1, 1])]
+
     return return_list if len(return_list) > 1 else return_list[0]
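A minimal sketch of the new `return_seq_lens` flag (toy inputs, assuming the keyword call conventions visible in this file):

```python
# Two ragged token-id lists padded to max_seq_len=4; besides the padded ids
# and the input mask, the new flag also returns the original lengths.
insts = [[5, 2, 9], [7, 4]]
src_id, input_mask, seq_lens = pad_batch_data(
    insts, pad_idx=0, max_seq_len=4,
    return_input_mask=True, return_seq_lens=True)
# seq_lens is an int64 array of shape [2, 1]: [[3], [2]]
```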
@@ -83,20 +83,16 @@ class BERTTokenizeReader(object):
     def generate_batch_data(self,
                             batch_data,
                             total_token_num,
-                            voc_size=-1,
-                            mask_id=-1,
                             return_input_mask=True,
                             return_max_len=False,
                             return_num_token=False):
         return prepare_batch_data(
             batch_data,
             total_token_num,
-            voc_size=-1,
             max_seq_len=self.max_seq_len,
             pad_id=self.vocab["[PAD]"],
             cls_id=self.vocab["[CLS]"],
             sep_id=self.vocab["[SEP]"],
-            mask_id=-1,
             return_input_mask=return_input_mask,
             return_max_len=return_max_len,
             return_num_token=return_num_token)
@@ -166,8 +162,6 @@ class BERTTokenizeReader(object):
         batch_data = self.generate_batch_data(
             batch_data,
             total_token_num,
-            voc_size=-1,
-            mask_id=-1,
             return_input_mask=True,
             return_max_len=True,
             return_num_token=False)
......
@@ -11,6 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Store PaddleHub version string """
+""" PaddleHub version string """
 hub_version = "0.3.1.alpha"
 module_proto_version = "0.1.0"