diff --git a/demo/text_classification/predict_predefine_net.py b/demo/text_classification/predict_predefine_net.py new file mode 100644 index 0000000000000000000000000000000000000000..e53cf2b8712f1160abb99e985ca85fb5a4174127 --- /dev/null +++ b/demo/text_classification/predict_predefine_net.py @@ -0,0 +1,99 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Finetuning on classification task """ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import ast +import numpy as np +import os +import time +import paddle +import paddle.fluid as fluid +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") +parser.add_argument("--network", type=str, default='bilstm', help="Pre-defined network which was connected 
after Transformer model, such as ERNIE, BERT ,RoBERTa and ELECTRA.") +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + # Load PaddleHub ERNIE Tiny pretrained model + module = hub.Module(name="ernie_tiny") + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use accuracy as metrics + # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC + dataset = hub.dataset.ChnSentiCorp() + + # ernie_tiny uses sub-word units to tokenize Chinese sentences + # For any module other than ernie_tiny, sp_model_path and word_dict_path should be set to None + reader = hub.reader.ClassifyReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len, + sp_model_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. + token_feature = outputs["sequence_output"] + + # Setup feed list for data feeder + # Must feed all the tensors that the module needs + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Set up the running config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_cuda=args.use_gpu, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.AdamWeightDecayStrategy()) + + # Define a classification finetune task by PaddleHub's API + # network choice: bilstm, bow, cnn, dpcnn, gru, lstm (PaddleHub pre-defined network) + # If you want to add a network after the ERNIE/BERT/RoBERTa/ELECTRA module, + # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, + # rather than outputs["pooled_output"], and leave feature as None + cls_task = hub.TextClassifierTask( + data_reader=reader, + token_feature=token_feature, + 
feed_list=feed_list, + network=args.network, + num_classes=dataset.num_labels, + config=config) + + # Data to be prdicted + data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], + ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] + + print(cls_task.predict(data=data, return_result=True)) diff --git a/demo/text_classification/run_cls.sh b/demo/text_classification/run_cls.sh new file mode 100644 index 0000000000000000000000000000000000000000..67de03950b2b3dfba70eba509224a04bd5664ce8 --- /dev/null +++ b/demo/text_classification/run_cls.sh @@ -0,0 +1,52 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +CKPT_DIR="./ckpt_chnsenticorp" + +python -u text_cls.py \ + --batch_size=24 \ + --use_gpu=True \ + --checkpoint_dir=${CKPT_DIR} \ + --learning_rate=5e-5 \ + --weight_decay=0.01 \ + --max_seq_len=128 \ + --warmup_proportion=0.1 \ + --num_epoch=3 \ + --use_data_parallel=True + +# The sugguested hyper parameters for difference task +# for ChineseGLUE: +# TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# XNLI_zh: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5 +# INEWS: batch_size=4, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5 +# DRCD: see demo: reading-comprehension +# CMRC2018: see demo: reading-comprehension +# BQ: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=100, lr=1e-5 +# MSRANER: see demo: sequence-labeling +# THUCNEWS: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=5e-5 +# IFLYTEKDATA: batch_size=16, weight_decay=0, num_epoch=5, max_seq_len=256, lr=1e-5 + +# for other tasks: +# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 +# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5 +# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5 +# QQP: batch_size=32, 
weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# CoLA: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# MRPC: batch_size=32, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 +# RTE: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=3e-5 +# MNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# Specify the matched/mismatched dev and test dataset with an underscore. +# mnli_m or mnli: dev and test in matched dataset. +# mnli_mm: dev and test in mismatched dataset. +# The difference can be seen in https://www.nyu.edu/projects/bowman/multinli/paper.pdf. +# If you are not sure which one to pick, just use mnli or mnli_m. +# XNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# Specify the language with an underscore like xnli_zh. +# ar- Arabic bg- Bulgarian de- German +# el- Greek en- English es- Spanish +# fr- French hi- Hindi ru- Russian +# sw- Swahili th- Thai tr- Turkish +# ur- Urdu vi- Vietnamese zh- Chinese (Simplified) diff --git a/demo/text_classification/run_classifier.sh b/demo/text_classification/run_cls_predefine_net.sh similarity index 94% rename from demo/text_classification/run_classifier.sh rename to demo/text_classification/run_cls_predefine_net.sh index c7bfc95ec9cb899cfb6af36e6fd7bc55094f5edc..f8dab3b999b449790052edccaf19f3d58b6b3192 100644 --- a/demo/text_classification/run_classifier.sh +++ b/demo/text_classification/run_cls_predefine_net.sh @@ -1,9 +1,9 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_chnsenticorp" +CKPT_DIR="./ckpt_chnsenticorp_predefine_net" -python -u text_classifier.py \ +python -u text_cls_predefine_net.py \ --batch_size=24 \ --use_gpu=True \ --checkpoint_dir=${CKPT_DIR} \ @@ -12,7 +12,8 @@ python -u text_classifier.py \ 
--max_seq_len=128 \ --warmup_proportion=0.1 \ --num_epoch=3 \ - --use_data_parallel=True + --use_data_parallel=True \ + --network=bilstm # The sugguested hyper parameters for difference task # for ChineseGLUE: diff --git a/demo/text_classification/run_predict.sh b/demo/text_classification/run_predict.sh index 5daba18211d544f7a9ba052f634a070fbc8f7cca..c4c0d6002a932fd48d074672624640df131d961e 100644 --- a/demo/text_classification/run_predict.sh +++ b/demo/text_classification/run_predict.sh @@ -3,7 +3,8 @@ export CUDA_VISIBLE_DEVICES=0 CKPT_DIR="./ckpt_chnsenticorp" -python -u predict.py --checkpoint_dir=$CKPT_DIR \ - --max_seq_len=128 \ - --use_gpu=True \ - --batch_size=24 \ +python -u predict.py \ + --checkpoint_dir=$CKPT_DIR \ + --max_seq_len=128 \ + --use_gpu=True \ + --batch_size=24 diff --git a/demo/text_classification/run_predict_predefine_net.sh b/demo/text_classification/run_predict_predefine_net.sh new file mode 100644 index 0000000000000000000000000000000000000000..a29e713f226cc7b92a062defeebd984ab960a0ca --- /dev/null +++ b/demo/text_classification/run_predict_predefine_net.sh @@ -0,0 +1,11 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +CKPT_DIR="./ckpt_chnsenticorp_predefine_net" + +python -u predict_predefine_net.py \ + --checkpoint_dir=$CKPT_DIR \ + --max_seq_len=128 \ + --use_gpu=True \ + --batch_size=24 \ + --network=bilstm diff --git a/demo/text_classification/text_classifier.py b/demo/text_classification/text_cls.py similarity index 100% rename from demo/text_classification/text_classifier.py rename to demo/text_classification/text_cls.py diff --git a/demo/text_classification/text_cls_predefine_net.py b/demo/text_classification/text_cls_predefine_net.py new file mode 100644 index 0000000000000000000000000000000000000000..23746c03e2563ca2696ff0351cb93d73ae17de1f --- /dev/null +++ b/demo/text_classification/text_cls_predefine_net.py @@ -0,0 +1,103 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Finetuning on classification task """ + +import argparse +import ast +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") +parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +parser.add_argument("--network", type=str, default='bilstm', help="Pre-defined network which was connected after Transformer model, such as ERNIE, BERT ,RoBERTa and ELECTRA.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") +args = parser.parse_args() +# yapf: enable. 
+ +if __name__ == '__main__': + + # Load PaddleHub ERNIE Tiny pretrained model + module = hub.Module(name="ernie_tiny") + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use accuracy as metrics + # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC + # metric should be acc, f1 or matthews + dataset = hub.dataset.ChnSentiCorp() + metrics_choices = ["acc"] + + # ernie_tiny uses sub-word units to tokenize Chinese sentences + # For any module other than ernie_tiny, sp_model_path and word_dict_path should be set to None + reader = hub.reader.ClassifyReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len, + sp_model_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. + token_feature = outputs["sequence_output"] + + # Setup feed list for data feeder + # Must feed all the tensors that the module needs + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Select finetune strategy, set up config and finetune + strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, + weight_decay=args.weight_decay, + learning_rate=args.learning_rate) + + # Set up the running config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_cuda=args.use_gpu, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=strategy) + + # Define a classification finetune task by PaddleHub's API + # network choice: bilstm, bow, cnn, dpcnn, gru, lstm (PaddleHub pre-defined network) + # If you want to add a network after the ERNIE/BERT/RoBERTa/ELECTRA module, + # you must use the outputs["sequence_output"] as the token_feature of 
TextClassifierTask, + # rather than outputs["pooled_output"], and feature is None + cls_task = hub.TextClassifierTask( + data_reader=reader, + token_feature=token_feature, + feed_list=feed_list, + network=args.network, + num_classes=dataset.num_labels, + config=config, + metrics_choices=metrics_choices) + + # Finetune and evaluate by PaddleHub's API + # will finish training, evaluation, testing, save model automatically + cls_task.finetune_and_eval() diff --git a/paddlehub/__init__.py b/paddlehub/__init__.py index a772a0a664e97c04ec7c63ef39f66801e47985b7..2b4721b8132ae3131cee9a8560e0eff5fbdcb164 100644 --- a/paddlehub/__init__.py +++ b/paddlehub/__init__.py @@ -28,6 +28,7 @@ from . import io from . import dataset from . import finetune from . import reader +from . import network from .common.dir import USER_HOME from .common.dir import HUB_HOME diff --git a/paddlehub/finetune/__init__.py b/paddlehub/finetune/__init__.py index 309e3855eb34f06f4dfc50304465742828071db9..be11b01db15040e0997426a9485f2a07d2bf85cf 100644 --- a/paddlehub/finetune/__init__.py +++ b/paddlehub/finetune/__init__.py @@ -1,5 +1,5 @@ -#coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. 
diff --git a/paddlehub/finetune/task/base_task.py b/paddlehub/finetune/task/base_task.py index 0bae268d62519425ed73ca5f43ddae5305b125ef..b0289ff8f97f9e0d56d0bfe08911c116e0961092 100644 --- a/paddlehub/finetune/task/base_task.py +++ b/paddlehub/finetune/task/base_task.py @@ -36,7 +36,7 @@ from visualdl import LogWriter import paddlehub as hub from paddlehub.common.paddle_helper import dtype_map, clone_program -from paddlehub.common.utils import mkdir, version_compare +from paddlehub.common.utils import mkdir from paddlehub.common.dir import tmp_dir from paddlehub.common.logger import logger from paddlehub.finetune.checkpoint import load_checkpoint, save_checkpoint @@ -992,17 +992,12 @@ class BaseTask(object): Returns: RunState: the running result of predict phase """ - if accelerate_mode: - if not version_compare(paddle.__version__, "1.6.1"): - logger.warning( - "Fail to open predict accelerate mode as it does not support paddle < 1.6.2. Please update PaddlePaddle." - ) - accelerate_mode = False - if isinstance(self._base_data_reader, hub.reader.LACClassifyReader): - logger.warning( - "LACClassifyReader does not support predictor, the accelerate_mode is closed now." - ) - accelerate_mode = False + if accelerate_mode and isinstance(self._base_data_reader, + hub.reader.LACClassifyReader): + logger.warning( + "LACClassifyReader does not support predictor, the accelerate_mode is closed now." 
+ ) + accelerate_mode = False self.accelerate_mode = accelerate_mode with self.phase_guard(phase="predict"): diff --git a/paddlehub/finetune/task/classifier_task.py b/paddlehub/finetune/task/classifier_task.py index 0ab6240583f8be6d58a8d2c1fdcd08bccbb72652..b137afdc750737a6927a981014817d6b6383c36a 100644 --- a/paddlehub/finetune/task/classifier_task.py +++ b/paddlehub/finetune/task/classifier_task.py @@ -17,12 +17,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import time from collections import OrderedDict import numpy as np +import paddle import paddle.fluid as fluid +import time +from paddlehub.common.logger import logger from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef +from paddlehub.reader.nlp_reader import ClassifyReader +import paddlehub.network as net + from .base_task import BaseTask @@ -104,7 +109,7 @@ class ClassifierTask(BaseTask): run_examples += run_state.run_examples run_step += run_state.run_step loss_sum += np.mean( - run_state.run_results[-1]) * run_state.run_examples + run_state.run_results[3]) * run_state.run_examples acc_sum += np.mean( run_state.run_results[2]) * run_state.run_examples np_labels = run_state.run_results[0] @@ -147,7 +152,7 @@ class ClassifierTask(BaseTask): results = [] for batch_state in run_states: batch_result = batch_state.run_results - batch_infer = np.argmax(batch_result, axis=2)[0] + batch_infer = np.argmax(batch_result[0], axis=1) results += [id2label[sample_infer] for sample_infer in batch_infer] return results @@ -156,21 +161,73 @@ ImageClassifierTask = ClassifierTask class TextClassifierTask(ClassifierTask): + """ + Create a text classification task. + It will use full-connect layer with softmax activation function to classify texts. 
+ """ + def __init__(self, - feature, num_classes, feed_list, data_reader, + feature=None, + token_feature=None, + network=None, startup_program=None, config=None, hidden_units=None, metrics_choices="default"): + """ + Args: + num_classes: total labels of the text classification task. + feed_list(list): the variable name that will be feeded to the main program + data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. + feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None. + token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None. + network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None. + main_program (object): the customized main program, default None. + startup_program (object): the customized startup program, default None. + config (RunConfig): run config for the task, such as batch_size, epoch, learning_rate setting and so on. Default None. + hidden_units(list): the element of `hidden_units` list is the full-connect layer size. It will add the full-connect layers to the program. Default None. + metrics_choices(list): metrics used to the task, default ["acc"]. + """ + if (not feature) and (not token_feature): + logger.error( + 'Both token_feature and feature are None, one of them must be setted.' + ) + exit(1) + elif feature and token_feature: + logger.error( + 'Both token_feature and feature are setted. One should be setted, the other should be None.' 
+ ) + exit(1) + + if network: + assert network in [ + 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru', 'lstm' + ], 'network choice must be one of bilstm, bow, cnn, dpcnn, gru, lstm!' + assert token_feature and ( + not feature + ), 'If you wanna use network, you must set token_feature ranther than feature for TextClassifierTask!' + assert len( + token_feature.shape + ) == 3, 'When you use network, the parameter token_feature must be the token-level feature, such as the sequence_output of ERNIE, BERT, RoBERTa and ELECTRA module.' + else: + assert feature and ( + not token_feature + ), 'If you do not use network, you must set feature ranther than token_feature for TextClassifierTask!' + assert len( + feature.shape + ) == 2, 'When you do not use network, the parameter feture must be the sentence-level feature, such as the pooled_output of ERNIE, BERT, RoBERTa and ELECTRA module.' + + self.network = network if metrics_choices == "default": metrics_choices = ["acc"] + super(TextClassifierTask, self).__init__( data_reader=data_reader, - feature=feature, + feature=feature if feature else token_feature, num_classes=num_classes, feed_list=feed_list, startup_program=startup_program, @@ -179,10 +236,37 @@ class TextClassifierTask(ClassifierTask): metrics_choices=metrics_choices) def _build_net(self): - cls_feats = fluid.layers.dropout( - x=self.feature, - dropout_prob=0.1, - dropout_implementation="upscale_in_train") + if isinstance(self._base_data_reader, ClassifyReader): + # ClassifyReader will return the sequence length of an input text + self.seq_len = fluid.layers.data( + name="seq_len", shape=[1], dtype='int64', lod_level=0) + self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) + + if self.network: + # add pre-defined net + net_func = getattr(net.classification, self.network) + if self.network == 'dpcnn': + # deepcnn network is no need to unpad + 
cls_feats = net_func( + self.feature, emb_dim=self.feature.shape[-1]) + else: + # the other pre-defined nets consume an unpadded sequence, + # which needs the seq_len that only ClassifyReader feeds + if not isinstance(self._base_data_reader, ClassifyReader): + raise ValueError( + "network %s requires ClassifyReader as data_reader" % + self.network) + unpad_feature = fluid.layers.sequence_unpad( + self.feature, length=self.seq_len_used) + cls_feats = net_func(unpad_feature) + logger.info( + "%s has been added in the TextClassifierTask!" % self.network) + else: + # not use pre-defined net but to use fc net + cls_feats = fluid.layers.dropout( + x=self.feature, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") if self.hidden_units is not None: for n_hidden in self.hidden_units: @@ -204,6 +288,33 @@ class TextClassifierTask(ClassifierTask): return [logits] + @property + def feed_list(self): + feed_list = [varname for varname in self._base_feed_list] + if isinstance(self._base_data_reader, ClassifyReader): + # ClassifyReader will return the sequence length of an input text + feed_list += [self.seq_len.name] + if self.is_train_phase or self.is_test_phase: + feed_list += [self.labels[0].name] + return feed_list + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + fetch_list = [ + self.labels[0].name, self.ret_infers.name, self.metrics[0].name, + self.loss.name + ] + else: + # predict phase + fetch_list = [self.outputs[0].name] + + if isinstance(self._base_data_reader, ClassifyReader): + # to avoid save_inference_model to prune seq_len variable + fetch_list += [self.seq_len.name] + + return fetch_list + class MultiLabelClassifierTask(ClassifierTask): def __init__(self, diff --git a/paddlehub/finetune/task/sequence_task.py b/paddlehub/finetune/task/sequence_task.py index 372eb3b218b23a578ea80b14c9da856829000598..ac46c990a2cf990f5c9cf7e9bbde3cfc9ae1f270 100644 --- a/paddlehub/finetune/task/sequence_task.py +++ b/paddlehub/finetune/task/sequence_task.py @@ -66,11 +66,7 @@ class SequenceLabelTask(BaseTask): def _build_net(self): self.seq_len = fluid.layers.data( name="seq_len", shape=[1], dtype='int64', lod_level=0) - - if version_compare(paddle.__version__, "1.6"): - self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) - else: - self.seq_len_used = self.seq_len + 
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) if self.add_crf: unpad_feature = fluid.layers.sequence_unpad( diff --git a/paddlehub/network/__init__.py b/paddlehub/network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f89112f4e09c8ba4319105379c9a4efa4c9d6f38 --- /dev/null +++ b/paddlehub/network/__init__.py @@ -0,0 +1,15 @@ +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import classification diff --git a/paddlehub/network/classification.py b/paddlehub/network/classification.py new file mode 100644 index 0000000000000000000000000000000000000000..8543d0f091bb21279d8e7fbd396f81f44f5244f7 --- /dev/null +++ b/paddlehub/network/classification.py @@ -0,0 +1,138 @@ +# coding:utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This module provides nets for text classification +""" + +import paddle +import paddle.fluid as fluid + + +def bilstm(token_embeddings, hid_dim=128, hid_dim2=96): + """ + bilstm net + """ + fc0 = fluid.layers.fc(input=token_embeddings, size=hid_dim * 4) + rfc0 = fluid.layers.fc(input=token_embeddings, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + rlstm_h, c = fluid.layers.dynamic_lstm( + input=rfc0, size=hid_dim * 4, is_reverse=True) + lstm_last = fluid.layers.sequence_last_step(input=lstm_h) + rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h) + lstm_last_tanh = fluid.layers.tanh(lstm_last) + rlstm_last_tanh = fluid.layers.tanh(rlstm_last) + + # concat layer (on the tanh-activated last steps of both directions) + lstm_concat = fluid.layers.concat(input=[lstm_last_tanh, rlstm_last_tanh], axis=1) + # full connect layer + fc = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh') + return fc + + +def bow(token_embeddings, hid_dim=128, hid_dim2=96): + """ + bow net + """ + # bow layer + bow = fluid.layers.sequence_pool(input=token_embeddings, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + # full connect layer + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + return fc_2 + + +def cnn(token_embeddings, hid_dim=128, win_size=3): + """ + cnn net + """ + # cnn layer + conv = fluid.nets.sequence_conv_pool( + input=token_embeddings, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + # full connect layer + fc_1 = fluid.layers.fc(input=conv, size=hid_dim) + return fc_1 + + +def dpcnn(token_embeddings, + hid_dim=128, + channel_size=250, + emb_dim=1024, + blocks=6): + """ + deepcnn net + """ + + def _block(x): + x = fluid.layers.relu(x) + x = fluid.layers.conv2d(x, channel_size, (3, 1), padding=(1, 0)) + x = fluid.layers.relu(x) + x = fluid.layers.conv2d(x, channel_size, (3, 1), padding=(1, 0)) + return x + + emb = 
fluid.layers.unsqueeze(token_embeddings, axes=[1]) + region_embedding = fluid.layers.conv2d( + emb, channel_size, (3, emb_dim), padding=(1, 0)) + conv_features = _block(region_embedding) + conv_features = conv_features + region_embedding + # multi-cnn layer + for i in range(blocks): + block_features = fluid.layers.pool2d( + conv_features, + pool_size=(3, 1), + pool_stride=(2, 1), + pool_padding=(1, 0)) + conv_features = _block(block_features) + conv_features = block_features + conv_features + features = fluid.layers.pool2d(conv_features, global_pooling=True) + features = fluid.layers.squeeze(features, axes=[2, 3]) + # full connect layer + fc_1 = fluid.layers.fc(input=features, size=hid_dim, act="tanh") + return fc_1 + + +def gru(token_embeddings, hid_dim=128, hid_dim2=96): + """ + gru net + """ + fc0 = fluid.layers.fc(input=token_embeddings, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + return fc1 + + +def lstm(token_embeddings, hid_dim=128, hid_dim2=96): + """ + lstm net + """ + # lstm layer + fc0 = fluid.layers.fc(input=token_embeddings, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + + # max pooling layer + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + # full connect layer + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + return fc1 diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py index 9eec98125ae841fb9e107954b202ea80b82bad27..7cf4cf67ce70de850ca0c04dc4cc37d58ee8c6f2 100644 --- a/paddlehub/reader/nlp_reader.py +++ b/paddlehub/reader/nlp_reader.py @@ -65,7 +65,6 @@ class BaseNLPReader(BaseReader): logger.warning( "use_task_id has been de discarded 
since PaddleHub v1.4.0, it's no necessary to feed task_ids now." ) - self.task_id = 0 self.Record_With_Label_Id = namedtuple( 'Record', @@ -272,11 +271,12 @@ class ClassifyReader(BaseNLPReader): batch_text_type_ids = [record.text_type_ids for record in batch_records] batch_position_ids = [record.position_ids for record in batch_records] - padded_token_ids, input_mask = pad_batch_data( + padded_token_ids, input_mask, batch_seq_lens = pad_batch_data( batch_token_ids, max_seq_len=self.max_seq_len, pad_idx=self.pad_id, - return_input_mask=True) + return_input_mask=True, + return_seq_lens=True) padded_text_type_ids = pad_batch_data( batch_text_type_ids, max_seq_len=self.max_seq_len, @@ -286,36 +286,16 @@ class ClassifyReader(BaseNLPReader): max_seq_len=self.max_seq_len, pad_idx=self.pad_id) + return_list = [ + padded_token_ids, padded_position_ids, padded_text_type_ids, + input_mask, batch_seq_lens + ] if phase != "predict": batch_labels = [record.label_id for record in batch_records] batch_labels = np.array(batch_labels).astype("int64").reshape( [-1, 1]) + return_list += [batch_labels] - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, batch_labels - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_labels - ] - else: - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids - ] return return_list @@ -369,40 +349,20 @@ class SequenceLabelReader(BaseNLPReader): max_seq_len=self.max_seq_len, pad_idx=self.pad_id) + return_list = [ + padded_token_ids, padded_position_ids, padded_text_type_ids, + 
input_mask + ] if phase != "predict": batch_label_ids = [record.label_id for record in batch_records] padded_label_ids = pad_batch_data( batch_label_ids, max_seq_len=self.max_seq_len, pad_idx=len(self.label_map) - 1) - - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_label_ids, batch_seq_lens - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, padded_label_ids, - batch_seq_lens - ] + return_list += [padded_label_ids, batch_seq_lens] else: - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, batch_seq_lens - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_seq_lens - ] + return_list += [batch_seq_lens] return return_list @@ -514,37 +474,18 @@ class MultiLabelClassifyReader(BaseNLPReader): max_seq_len=self.max_seq_len, pad_idx=self.pad_id) + return_list = [ + padded_token_ids, padded_position_ids, padded_text_type_ids, + input_mask + ] if phase != "predict": batch_labels_ids = [record.label_id for record in batch_records] num_label = len(self.dataset.get_labels()) batch_labels = np.array(batch_labels_ids).astype("int64").reshape( [-1, num_label]) - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, batch_labels - ] + return_list += [batch_labels] - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_labels - ] - else: - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask - ] - - 
if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids - ] return return_list def _convert_example_to_record(self, @@ -634,37 +575,17 @@ class RegressionReader(BaseNLPReader): max_seq_len=self.max_seq_len, pad_idx=self.pad_id) + return_list = [ + padded_token_ids, padded_position_ids, padded_text_type_ids, + input_mask + ] if phase != "predict": batch_labels = [record.label_id for record in batch_records] # the only diff with ClassifyReader: astype("float32") batch_labels = np.array(batch_labels).astype("float32").reshape( [-1, 1]) - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, batch_labels - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_labels - ] - else: - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask - ] - - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids - ] + return_list += [batch_labels] return return_list @@ -831,6 +752,10 @@ class ReadingComprehensionReader(BaseNLPReader): pad_idx=self.pad_id, max_seq_len=self.max_seq_len) + return_list = [ + padded_token_ids, padded_position_ids, padded_text_type_ids, + input_mask, batch_unique_ids + ] if phase != "predict": batch_start_position = [ record.start_position for record in batch_records @@ -843,33 +768,8 @@ class ReadingComprehensionReader(BaseNLPReader): batch_end_position = np.array(batch_end_position).astype( "int64").reshape([-1, 1]) - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, 
- input_mask, batch_unique_ids, batch_start_position, - batch_end_position - ] + return_list += [batch_start_position, batch_end_position] - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_unique_ids, - batch_start_position, batch_end_position - ] - - else: - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, batch_unique_ids - ] - if self.use_task_id: - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - return_list = [ - padded_token_ids, padded_position_ids, padded_text_type_ids, - input_mask, padded_task_ids, batch_unique_ids - ] return return_list def _prepare_batch_data(self, records, batch_size, phase=None):