From 510f540704bf56287882aa4f88572c499dd0698a Mon Sep 17 00:00:00 2001
From: zhangxuefei
Date: Fri, 2 Aug 2019 15:25:57 +0800
Subject: [PATCH] Update text-cls demo and multi-label cls demo to adapt to
 ernie v2

---
 .../multi_label_classifier.py                 | 44 ++++++++++------
 demo/multi-label-classification/predict.py    | 50 ++++++++++++-------
 .../run_classifier.sh                         |  3 +-
 .../multi-label-classification/run_predict.sh |  2 +-
 paddlehub/module/module.py                    |  3 +-
 paddlehub/reader/nlp_reader.py                |  7 ++-
 paddlehub/reader/tokenization.py              |  2 +-
 7 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/demo/multi-label-classification/multi_label_classifier.py b/demo/multi-label-classification/multi_label_classifier.py
index e11cc0fc..b1e7086f 100644
--- a/demo/multi-label-classification/multi_label_classifier.py
+++ b/demo/multi-label-classification/multi_label_classifier.py
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, use BERT.")
 args = parser.parse_args()
 # yapf: enable.
 
 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+    # Load the PaddleHub pretrained model (ERNIE v2 if use_taskid else BERT)
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
 
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
 
-    # Step2: Download dataset and use MultiLabelReader to read dataset
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
 
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)
 
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
 pooled_output = outputs["pooled_output"]
 
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
diff --git a/demo/multi-label-classification/predict.py b/demo/multi-label-classification/predict.py
index 138c5ade..6a6ef240 100644
--- a/demo/multi-label-classification/predict.py
+++ b/demo/multi-label-classification/predict.py
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, use BERT.")
 args = parser.parse_args()
 # yapf: enable.
 
 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
+    # Load the PaddleHub pretrained model (ERNIE v2 if use_taskid else BERT)
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
 
-    # Sentence classification dataset reader
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
 
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)
 
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
 pooled_output = outputs["pooled_output"]
 
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
@@ -104,7 +116,7 @@ if __name__ == '__main__':
         for result in results:
             # get predict index
             label_ids = []
-            for i in range(num_label):
+            for i in range(dataset.num_labels):
                 label_val = np.argmax(result[i])
                 label_ids.append(label_val)
             print("%s\tpredict=%s" % (data[index][0], label_ids))
diff --git a/demo/multi-label-classification/run_classifier.sh b/demo/multi-label-classification/run_classifier.sh
index 93b88833..f08026a4 100644
--- a/demo/multi-label-classification/run_classifier.sh
+++ b/demo/multi-label-classification/run_classifier.sh
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
-    --num_epoch=3
+    --num_epoch=3 \
+    --use_taskid=False
diff --git a/demo/multi-label-classification/run_predict.sh b/demo/multi-label-classification/run_predict.sh
index ea28d8d9..f0976fe1 100644
--- a/demo/multi-label-classification/run_predict.sh
+++ b/demo/multi-label-classification/run_predict.sh
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
diff --git a/paddlehub/module/module.py b/paddlehub/module/module.py
index e218222c..fce447b1 100644
--- a/paddlehub/module/module.py
+++ b/paddlehub/module/module.py
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(
                     max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index e102756e..ccc1ea34 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))
 
         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])
 
         if phase != "predict":
             Record = namedtuple(
diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py
index 276f0798..80c1856b 100644
--- a/paddlehub/reader/tokenization.py
+++ b/paddlehub/reader/tokenization.py
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
    for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
-- 
GitLab
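
Usage note (illustrative only, not part of the patch): the updated shell scripts keep --use_taskid=False, so they still run against BERT. A minimal sketch of exercising the new ERNIE v2 path with the predict demo, assuming the ernie_eng_base.hub_module referenced above is installed and the checkpoint in CKPT_DIR was finetuned with the same flag (the ERNIE v2 program feeds the extra task_ids tensor, so a checkpoint trained on the BERT program presumably will not load):

    export FLAGS_eager_delete_tensor_gb=0.0
    export CUDA_VISIBLE_DEVICES=0

    CKPT_DIR="./ckpt_toxic"
    # Flip the new flag to True to predict with ERNIE v2 instead of BERT.
    python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid True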