diff --git a/demo/ernie-classification/finetune_with_hub.py b/demo/ernie-classification/finetune_with_hub.py
index 0e82da176ec3c5a3722b767e5f6043e47c872cfe..0e6a284cd698472500f89a861c56d53859940979 100644
--- a/demo/ernie-classification/finetune_with_hub.py
+++ b/demo/ernie-classification/finetune_with_hub.py
@@ -36,7 +36,6 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
-
 args = parser.parse_args()
 # yapf: enable.
 
@@ -55,9 +54,7 @@ if __name__ == '__main__':
     # loading Paddlehub BERT
     module = hub.Module(module_dir=args.hub_module_dir)
 
-    # Use BERTTokenizeReader to tokenize the dataset according to model's
-    # vocabulary
-    reader = hub.reader.BERTTokenizeReader(
+    reader = hub.reader.ClassifyReader(
         dataset=hub.dataset.ChnSentiCorp(),  # download chnsenticorp dataset
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
diff --git a/demo/ernie-classification/run_fintune_with_hub.sh b/demo/ernie-classification/run_fintune_with_hub.sh
index 2dc05f17c05e6dce1d61c3cfd1236b60f98787ea..b267e3da28d60738c977820d3b86ad15a5ab0081 100644
--- a/demo/ernie-classification/run_fintune_with_hub.sh
+++ b/demo/ernie-classification/run_fintune_with_hub.sh
@@ -1,16 +1,8 @@
-export CUDA_VISIBLE_DEVICES=5
+export CUDA_VISIBLE_DEVICES=3
 
-DATA_PATH=./chnsenticorp_data
-
-
-HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
-#HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
 CKPT_DIR="./ckpt"
-#rm -rf $CKPT_DIR
 python -u finetune_with_hub.py \
     --batch_size 32 \
-    --hub_module_dir=$HUB_MODULE_DIR \
-    --data_dir ${DATA_PATH} \
     --weight_decay 0.01 \
     --checkpoint_dir $CKPT_DIR \
     --num_epoch 3 \
diff --git a/demo/ernie-seq-label/finetune_with_hub.py b/demo/ernie-seq-label/finetune_with_hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbab65c0744190134927367d18753604c7a54d85
--- /dev/null
+++ b/demo/ernie-seq-label/finetune_with_hub.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import argparse +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory") +parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") + +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + strategy = hub.BERTFinetuneStrategy( + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + warmup_strategy="linear_warmup_decay", + ) + config = hub.RunConfig( + eval_interval=100, + use_cuda=True, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + strategy=strategy) + + # loading Paddlehub ERNIE + module = hub.Module(name="ernie") + + reader = hub.reader.SequenceLabelReader( + dataset=hub.dataset.MSRA_NER(), + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len) + + num_labels = len(reader.get_labels()) + + input_dict, output_dict, program = module.context( + sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len) + + with fluid.program_guard(program): + label = fluid.layers.data( + name="label", shape=[args.max_seq_len, 1], dtype='int64') + seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64') + + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. 
+        sequence_output = output_dict["sequence_output"]
+
+        # Set up the feed list for the data feeder.
+        # Every input tensor required by the ERNIE module must be fed.
+        feed_list = [
+            input_dict["input_ids"].name, input_dict["position_ids"].name,
+            input_dict["segment_ids"].name, input_dict["input_mask"].name,
+            label.name, seq_len.name
+        ]
+        # Define a sequence labeling fine-tuning task by PaddleHub's API
+        seq_label_task = hub.append_sequence_labeler(
+            feature=sequence_output,
+            labels=label,
+            seq_len=seq_len,
+            num_classes=num_labels)
+
+        # Fine-tune and evaluate by PaddleHub's API;
+        # training, evaluation, testing and model saving are handled automatically
+        hub.finetune_and_eval(
+            task=seq_label_task,
+            data_reader=reader,
+            feed_list=feed_list,
+            config=config)
diff --git a/demo/ernie-seq-label/run_fintune_with_hub.sh b/demo/ernie-seq-label/run_fintune_with_hub.sh
new file mode 100644
index 0000000000000000000000000000000000000000..50d5ec4e010f993d7a0b18326dae572e11fbcc1a
--- /dev/null
+++ b/demo/ernie-seq-label/run_fintune_with_hub.sh
@@ -0,0 +1,11 @@
+export CUDA_VISIBLE_DEVICES=3
+
+CKPT_DIR="./ckpt"
+
+python -u finetune_with_hub.py \
+    --batch_size 16 \
+    --weight_decay 0.01 \
+    --checkpoint_dir $CKPT_DIR \
+    --num_epoch 3 \
+    --max_seq_len 256 \
+    --learning_rate 5e-5
diff --git a/paddlehub/__init__.py b/paddlehub/__init__.py
index e8e4eae719b0d7c2dd779e44bb0ef3cf04861de9..8fce7887c925ba2b09d84aa88e7db304a3dc7026 100644
--- a/paddlehub/__init__.py
+++ b/paddlehub/__init__.py
@@ -33,6 +33,7 @@ from .module.manager import default_module_manager
 from .io.type import DataType
 
 from .finetune.network import append_mlp_classifier
+from .finetune.network import append_sequence_labeler
 from .finetune.finetune import finetune_and_eval
 from .finetune.config import RunConfig
 from .finetune.task import Task
diff --git a/paddlehub/dataset/msra_ner.py b/paddlehub/dataset/msra_ner.py
index 53a45fd55dec91c82b9c4000d3f288c44affed42..fd091f7ebeca973e7e0cb7d66f7231ce3e9142c0 100644
--- a/paddlehub/dataset/msra_ner.py
+++ b/paddlehub/dataset/msra_ner.py
@@ -30,6 +30,8 @@ class MSRA_NER(object):
 
         self._load_label_map()
         self._load_train_examples()
+        self._load_test_examples()
+        self._load_dev_examples()
 
     def _load_label_map(self):
         self.label_map_file = os.path.join(self.dataset_dir, "label_map.json")
@@ -40,12 +42,28 @@ class MSRA_NER(object):
         train_file = os.path.join(self.dataset_dir, "train.tsv")
         self.train_examples = self._read_tsv(train_file)
 
+    def _load_dev_examples(self):
+        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
+        self.dev_examples = self._read_tsv(self.dev_file)
+
+    def _load_test_examples(self):
+        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
+        self.test_examples = self._read_tsv(self.test_file)
+
     def get_train_examples(self):
         return self.train_examples
 
+    def get_dev_examples(self):
+        return self.dev_examples
+
+    def get_test_examples(self):
+        return self.test_examples
+
     def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
+        return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
+
+    def get_label_map(self):
+        return self.label_map
 
     def _read_tsv(self, input_file, quotechar=None):
         """Reads a tab separated value file."""
diff --git a/paddlehub/finetune/finetune.py b/paddlehub/finetune/finetune.py
index b724049601ac1bfacc64b5853c2cc4c785affc79..36880dd85cda1880cc50af1a13ec0e8e1660d6d0 100644
--- a/paddlehub/finetune/finetune.py
+++ b/paddlehub/finetune/finetune.py
@@ -22,6 +22,7 @@ import multiprocessing
 
 import paddle
 import paddle.fluid as fluid
+import numpy as np
 
 from paddlehub.common.logger import logger
 from paddlehub.finetune.strategy import BERTFinetuneStrategy, DefaultStrategy
@@ -61,7 +62,149 @@ def _do_memory_optimization(task, config):
             (lower_mem, upper_mem, unit)),
 
 
-def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
+def _finetune_seq_label_task(task,
+                             data_reader,
+                             feed_list,
+                             config=None,
+                             do_eval=False):
+    """
+    Fine-tune a sequence labeling task; evaluation metrics are F1-score, precision and recall.
+
+    """
+    main_program = task.main_program()
+    startup_program = task.startup_program()
+    loss = task.variable("loss")
+    seq_len = task.variable("seq_len")
+
+    num_epoch = config.num_epoch
+    batch_size = config.batch_size
+
+    place, dev_count = _get_running_device_info(config)
+    with fluid.program_guard(main_program, startup_program):
+        exe = fluid.Executor(place=place)
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+
+        # Select strategy
+        if isinstance(config.strategy, BERTFinetuneStrategy):
+            scheduled_lr = config.strategy.execute(loss, main_program,
+                                                   data_reader, config)
+        elif isinstance(config.strategy, DefaultStrategy):
+            config.strategy.execute(loss)
+        # TODO: add more fine-tuning strategies
+
+        _do_memory_optimization(task, config)
+
+        # Try to restore model training checkpoint
+        current_epoch, global_step = load_checkpoint(config.checkpoint_dir, exe)
+
+        train_time_used = 0
+        logger.info("PaddleHub finetune start")
+
+        # Finetune loop
+        for epoch in range(current_epoch, num_epoch + 1):
+            train_reader = data_reader.data_generator(
+                batch_size=batch_size, phase='train')
+            num_trained_examples = loss_sum = 0
+            for batch in train_reader():
+                num_batch_examples = len(batch)
+                train_time_begin = time.time()
+                loss_v = exe.run(
+                    feed=data_feeder.feed(batch), fetch_list=[loss.name])
+                train_time_used += time.time() - train_time_begin
+                global_step += 1
+                num_trained_examples += num_batch_examples
+                loss_sum += loss_v[0] * num_batch_examples
+
+                # log finetune status
+                if global_step % config.log_interval == 0:
+                    avg_loss = loss_sum / num_trained_examples
+                    speed = config.log_interval / train_time_used
+                    logger.info("step %d: loss=%.5f [step/sec: %.2f]" %
+                                (global_step, avg_loss, speed))
+
+                    train_time_used = 0
+                    num_trained_examples = loss_sum = 0
+
+                if config.save_ckpt_interval and global_step % config.save_ckpt_interval == 0:
+                    # NOTE: the current checkpoint saving mechanism is not complete;
+                    # it can't restore the dataset training status correctly
+                    save_checkpoint(
+                        checkpoint_dir=config.checkpoint_dir,
+                        current_epoch=epoch,
+                        global_step=global_step,
+                        exe=exe)
+
+                if do_eval and global_step % config.eval_interval == 0:
+                    evaluate_seq_label(
+                        task,
+                        data_reader,
+                        feed_list,
+                        phase="dev",
+                        config=config)
+                    evaluate_seq_label(
+                        task,
+                        data_reader,
+                        feed_list,
+                        phase="test",
+                        config=config)
+
+        # NOTE: the current checkpoint saving mechanism is not complete; it can't
+        # restore the dataset training status
+        save_checkpoint(
+            checkpoint_dir=config.checkpoint_dir,
+            current_epoch=num_epoch + 1,
+            global_step=global_step,
+            exe=exe)
+
+        if do_eval:
+            evaluate_seq_label(
+                task, data_reader, feed_list, phase="test", config=config)
+        logger.info("PaddleHub finetune finished.")
+
+
+def evaluate_seq_label(task, data_reader, feed_list, phase="test", config=None):
+    fetch_list = [
+        task.variable("labels").name,
+        task.variable("infers").name,
+        task.variable("seq_len").name,
+        task.variable("loss").name
+    ]
+    logger.info("Evaluation on {} dataset start".format(phase))
+    inference_program = task.inference_program()
+    batch_size = config.batch_size
+    place, dev_count = _get_running_device_info(config)
+    exe = fluid.Executor(place=place)
+    with fluid.program_guard(inference_program):
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        num_eval_examples = acc_sum = loss_sum = 0
+        test_reader = data_reader.data_generator(
+            batch_size=batch_size, phase=phase)
+        eval_time_begin = time.time()
+        eval_step = 0
+        total_label, total_infer, total_correct = 0.0, 0.0, 0.0
+        for batch in test_reader():
+            num_batch_examples = len(batch)
+            eval_step += 1
+            np_labels, np_infers, np_lens, _ = exe.run(
+                feed=data_feeder.feed(batch), fetch_list=fetch_list)
+            label_num, infer_num, correct_num = chunk_eval(
+                np_labels, np_infers, np_lens, 7, dev_count)
+
+            total_infer += infer_num
+            total_label += label_num
+            total_correct += correct_num
+
+        precision, recall, f1 = calculate_f1(total_label, total_infer,
+                                             total_correct)
+        eval_time_used = time.time() - eval_time_begin
+        eval_speed = eval_step / eval_time_used
+        logger.info(
+            "[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]"
+            % (phase, f1, precision, recall, eval_speed))
+
+
+def _finetune_cls_task(task, data_reader, feed_list, config=None,
+                       do_eval=False):
     main_program = task.main_program()
     startup_program = task.startup_program()
     loss = task.variable("loss")
@@ -175,11 +318,15 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
 
 
 def finetune_and_eval(task, data_reader, feed_list, config=None):
-    _finetune_model(task, data_reader, feed_list, config, do_eval=True)
+    if task.task_type == "sequence_labeling":
+        _finetune_seq_label_task(
+            task, data_reader, feed_list, config, do_eval=True)
+    else:
+        _finetune_cls_task(task, data_reader, feed_list, config, do_eval=True)
 
 
 def finetune(task, data_reader, feed_list, config=None):
-    _finetune_model(task, data_reader, feed_list, config, do_eval=False)
+    _finetune_cls_task(task, data_reader, feed_list, config, do_eval=False)
 
 
 def evaluate(task, data_reader, feed_list, phase="test", config=None):
@@ -217,3 +364,107 @@ def evaluate(task, data_reader, feed_list, phase="test", config=None):
             (phase, avg_loss, avg_acc, eval_speed))
 
     return avg_loss, avg_acc, eval_speed
+
+
+# Sequence label evaluation functions
+def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
+    def extract_bio_chunk(seq):
+        chunks = []
+        cur_chunk = None
+        null_index = tag_num - 1
+        for index in range(len(seq)):
+            tag = seq[index]
+            tag_type = tag // 2
+            tag_pos = tag % 2
+
+            if tag == null_index:
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                    cur_chunk = None
+                continue
+
+            if tag_pos == 0:
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                    cur_chunk = {}
+                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+            else:
+                if cur_chunk is None:
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+                    continue
+
+                if cur_chunk["type"] == tag_type:
+                    cur_chunk["en"] = index + 1
+                else:
+                    chunks.append(cur_chunk)
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+        if cur_chunk is not None:
+            chunks.append(cur_chunk)
+        return chunks
+
+    null_index = tag_num - 1
+    num_label = 0
+    num_infer = 0
+    num_correct = 0
+    labels = np_labels.reshape([-1]).astype(np.int32).tolist()
+    infers = np_infers.reshape([-1]).astype(np.int32).tolist()
+    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
+
+    base_index = 0
+    for dev_index in range(dev_count):
+        lens = all_lens[dev_index]
+        max_len = 0
+        for l in lens:
+            max_len = max(max_len, l)
+
+        for i in range(len(lens)):
+            seq_st = base_index + i * max_len + 1
+            seq_en = seq_st + (lens[i] - 2)
+            infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
+            label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
+            num_infer += len(infer_chunks)
+            num_label += len(label_chunks)
+
+            infer_index = 0
+            label_index = 0
+            while label_index < len(label_chunks) \
+                    and infer_index < len(infer_chunks):
+                if infer_chunks[infer_index]["st"] \
+                        < label_chunks[label_index]["st"]:
+                    infer_index += 1
+                elif infer_chunks[infer_index]["st"] \
+                        > label_chunks[label_index]["st"]:
+                    label_index += 1
+                else:
+                    if infer_chunks[infer_index]["en"] \
+                            == label_chunks[label_index]["en"] \
+                            and infer_chunks[infer_index]["type"] \
+                            == label_chunks[label_index]["type"]:
+                        num_correct += 1
+
+                    infer_index += 1
+                    label_index += 1
+
+        base_index += max_len * len(lens)
+
+    return num_label, num_infer, num_correct
+
+
+def calculate_f1(num_label, num_infer, num_correct):
+    if num_infer == 0:
+        precision = 0.0
+    else:
+        precision = num_correct * 1.0 / num_infer
+
+    if num_label == 0:
+        recall = 0.0
+    else:
+        recall = num_correct * 1.0 / num_label
+
+    if num_correct == 0:
+        f1 = 0.0
+    else:
+        f1 = 2 * precision * recall / (precision + recall)
+    return precision, recall, f1
diff --git a/paddlehub/finetune/network.py b/paddlehub/finetune/network.py
index dc3f5c5f319421ba2137cca0791b50658fa43514..5513063f046a987931411344651596538c10bcc7 100644
--- a/paddlehub/finetune/network.py
+++ b/paddlehub/finetune/network.py
@@ -55,7 +55,6 @@ def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
     accuracy = fluid.layers.accuracy(
         input=probs, label=label, total=num_example)
 
-    # TODO: encapsulate to Task
     graph_var_dict = {
         "loss": loss,
         "probs": probs,
@@ -77,5 +76,40 @@ def append_mlp_multi_classifier(feature,
     pass
 
 
-def append_sequence_labler(feature, label):
-    pass
+def append_sequence_labeler(feature, labels, seq_len, num_classes=None):
+    logits = fluid.layers.fc(
+        input=feature,
+        size=num_classes,
+        num_flatten_dims=2,
+        param_attr=fluid.ParamAttr(
+            name="cls_seq_label_out_w",
+            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+        bias_attr=fluid.ParamAttr(
+            name="cls_seq_label_out_b",
+            initializer=fluid.initializer.Constant(0.)))
+
+    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(
+        x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
+
+    labels = fluid.layers.flatten(labels, axis=2)
+    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+        logits=fluid.layers.flatten(logits, axis=2),
+        label=labels,
+        return_softmax=True)
+    loss = fluid.layers.mean(x=ce_loss)
+    # accuracy = fluid.layers.accuracy(
+    #     input=probs, label=labels, total=num_example)
+
+    graph_var_dict = {
+        "loss": loss,
+        "probs": probs,
+        "labels": ret_labels,
+        "infers": ret_infers,
+        "seq_len": seq_len
+    }
+
+    task = Task("sequence_labeling", graph_var_dict,
+                fluid.default_main_program(), fluid.default_startup_program())
+
+    return task
diff --git a/paddlehub/reader/__init__.py b/paddlehub/reader/__init__.py
index 5e6f3c250378b90169817f25147ee65aca4257d9..59f53d685ce788de57c4fb7ca372d297e83ed7da 100644
--- a/paddlehub/reader/__init__.py
+++ b/paddlehub/reader/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
 
 from .nlp_reader import BERTTokenizeReader
+from .task_reader import ClassifyReader
+from .task_reader import SequenceLabelReader
diff --git a/paddlehub/reader/batching.py b/paddlehub/reader/batching.py
index 3543d82c11e96f59705eb03059f18b53c21272c3..e33b0082627f46bd430c53b06db9cbb2d996edec 100644
--- a/paddlehub/reader/batching.py
+++ b/paddlehub/reader/batching.py
@@ -20,63 +20,8 @@ from __future__ import print_function
 import numpy as np
 
 
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
 def prepare_batch_data(insts,
                        total_token_num,
-                       voc_size=0,
                        max_seq_len=128,
                        pad_id=None,
                        cls_id=None,
@@ -103,17 +48,7 @@ def prepare_batch_data(insts,
         labels = np.array(labels).astype("int64").reshape([-1, 1])
         labels_list.append(labels)
 
-    # First step: do mask without padding
-    if mask_id >= 0:
-        out, mask_label, mask_pos = mask(
-            batch_src_ids,
-            total_token_num,
-            vocab_size=voc_size,
-            CLS=cls_id,
-            SEP=sep_id,
-            MASK=mask_id)
-    else:
-        out = batch_src_ids
+    out = batch_src_ids
     # Second step: padding
     src_id, self_input_mask = pad_batch_data(
         out, pad_idx=pad_id, max_seq_len=max_seq_len, return_input_mask=True)
@@ -130,12 +65,7 @@ def prepare_batch_data(insts,
         return_pos=False,
         return_input_mask=False)
 
-    if mask_id >= 0:
-        return_list = [
-            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
-        ] + labels_list
-    else:
-        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+    return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
 
     return return_list if len(return_list) > 1 else return_list[0]
 
@@ -146,7 +76,8 @@ def pad_batch_data(insts,
                    return_pos=False,
                    return_input_mask=False,
                    return_max_len=False,
-                   return_num_token=False):
+                   return_num_token=False,
+                   return_seq_lens=False):
     """
     Pad the instances to the max sequence length in batch, and generate the
     corresponding position data and input mask.
@@ -187,4 +118,8 @@ def pad_batch_data(insts,
         num_token += len(inst)
         return_list += [num_token]
 
+    if return_seq_lens:
+        seq_lens = np.array([len(inst) for inst in insts])
+        return_list += [seq_lens.astype("int64").reshape([-1, 1])]
+
     return return_list if len(return_list) > 1 else return_list[0]
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index 5a37a79940eab5bd200fca814a9c9dc0e114b8dc..00bdac4adefeb0cf0a7f1cf2fe3638ff7bbc89c1 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -83,20 +83,16 @@ class BERTTokenizeReader(object):
     def generate_batch_data(self,
                             batch_data,
                             total_token_num,
-                            voc_size=-1,
-                            mask_id=-1,
                             return_input_mask=True,
                             return_max_len=False,
                             return_num_token=False):
         return prepare_batch_data(
             batch_data,
             total_token_num,
-            voc_size=-1,
             max_seq_len=self.max_seq_len,
             pad_id=self.vocab["[PAD]"],
             cls_id=self.vocab["[CLS]"],
             sep_id=self.vocab["[SEP]"],
-            mask_id=-1,
             return_input_mask=return_input_mask,
             return_max_len=return_max_len,
             return_num_token=return_num_token)
@@ -166,8 +162,6 @@ class BERTTokenizeReader(object):
                 batch_data = self.generate_batch_data(
                     batch_data,
                     total_token_num,
-                    voc_size=-1,
-                    mask_id=-1,
                     return_input_mask=True,
                     return_max_len=True,
                     return_num_token=False)
diff --git a/paddlehub/version.py b/paddlehub/version.py
index 32298f233077e77625a31f1ff9fbca9169d1066d..61c98cc21c95e4098a5aee992863bdc72f91b0be 100644
--- a/paddlehub/version.py
+++ b/paddlehub/version.py
@@ -11,6 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Store PaddleHub version string """
+""" PaddleHub version string """
 hub_version = "0.3.1.alpha"
 module_proto_version = "0.1.0"
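
For illustration only, a minimal sketch (not part of the patch) of how the chunk counts produced by the new chunk_eval helper feed the calculate_f1 helper added in paddlehub/finetune/finetune.py; the counts below are made up:

    from paddlehub.finetune.finetune import calculate_f1

    # Assumed counts: 10 gold chunks, 8 predicted chunks, 6 exact matches
    # (same start, end and chunk type).
    precision, recall, f1 = calculate_f1(num_label=10, num_infer=8, num_correct=6)
    # precision = 6 / 8  = 0.75
    # recall    = 6 / 10 = 0.60
    # f1        = 2 * 0.75 * 0.60 / (0.75 + 0.60) = 0.667 (approximately)
    print(precision, recall, f1)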