From f04de2349978f6acdbbc972c64063e0b4f9ebd14 Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Mon, 3 Jun 2019 16:51:46 +0800 Subject: [PATCH] Update the demo text-cls and seq-label (#47) * Update the demo text-cls and seq-label --- demo/sequence-labeling/predict.py | 127 +++++++++----------- demo/sequence-labeling/run_predict.sh | 2 +- demo/sequence-labeling/sequence_label.py | 29 ++--- demo/text-classification/predict.py | 85 +++++++------ demo/text-classification/run_predict.sh | 2 +- demo/text-classification/text_classifier.py | 33 ++--- paddlehub/finetune/task.py | 27 +++-- 7 files changed, 158 insertions(+), 147 deletions(-) diff --git a/demo/sequence-labeling/predict.py b/demo/sequence-labeling/predict.py index 0c3cf513..414de6ec 100644 --- a/demo/sequence-labeling/predict.py +++ b/demo/sequence-labeling/predict.py @@ -33,6 +33,7 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 parser = argparse.ArgumentParser(__doc__) parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") args = parser.parse_args() # yapf: enable. 
@@ -40,8 +41,7 @@ args = parser.parse_args() if __name__ == '__main__': # loading Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") - input_dict, output_dict, program = module.context( - max_seq_len=args.max_seq_len) + inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) # Sentence labeling dataset reader dataset = hub.dataset.MSRA_NER() @@ -53,70 +53,61 @@ if __name__ == '__main__': place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - with fluid.program_guard(program): - # Use "sequence_outputs" for token-level output. - sequence_output = output_dict["sequence_output"] - - # Define a classfication finetune task by PaddleHub's API - seq_label_task = hub.create_seq_label_task( - feature=sequence_output, - num_classes=dataset.num_labels, - max_seq_len=args.max_seq_len) - - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - # Compared to classification task, we need add seq_len tensor to feedlist - feed_list = [ - input_dict["input_ids"].name, input_dict["position_ids"].name, - input_dict["segment_ids"].name, input_dict["input_mask"].name, - seq_label_task.variable('label').name, - seq_label_task.variable('seq_len').name - ] - - fetch_list = [ - seq_label_task.variable("labels").name, - seq_label_task.variable("infers").name, - seq_label_task.variable("seq_len").name - ] - - # classification probability tensor - probs = seq_label_task.variable("probs") - - # load best model checkpoint - fluid.io.load_persistables(exe, args.checkpoint_dir) - - inference_program = program.clone(for_test=True) - - # calculate the num of label from probs variable shape - num_labels = seq_label_task.variable("probs").shape[1] - - data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) - test_reader = reader.data_generator(phase='test', shuffle=False) - test_examples = dataset.get_test_examples() - total_label, total_infer, total_correct = 0.0, 0.0, 0.0 - for index, 
batch in enumerate(test_reader()): - np_labels, np_infers, np_lens = exe.run( - feed=data_feeder.feed(batch), - fetch_list=fetch_list, - program=inference_program) - label_num, infer_num, correct_num = chunk_eval( - np_labels, np_infers, np_lens, num_labels) - - total_infer += infer_num - total_label += label_num - total_correct += correct_num - - labels = np_labels.reshape([-1]).astype(np.int32).tolist() - label_str = "" - count = 0 - for label_val in labels: - label_str += inv_label_map[label_val] - count += 1 - if count == np_lens: - break - - print("%s\tpredict=%s" % (test_examples[index], label_str)) - precision, recall, f1 = calculate_f1(total_label, total_infer, - total_correct) - print("F1-Score=%f, precision=%f, recall=%f " % (f1, precision, recall)) + # Construct transfer learning network + # Use "sequence_output" for token-level output. + sequence_output = outputs["sequence_output"] + + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_cuda=args.use_gpu, + batch_size=args.batch_size, + enable_memory_optim=False, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) + + # Define a sequence labeling finetune task by PaddleHub's API + seq_label_task = hub.SequenceLabelTask( + data_reader=reader, + feature=sequence_output, + feed_list=feed_list, + max_seq_len=args.max_seq_len, + num_classes=dataset.num_labels, + config=config) + + # test data + data = [ + ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], + ["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], + ["其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"], + ["有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"], + ["不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"], + ] + + results = seq_label_task.predict(data=data)[0] + np_infers = results[0] + np_lens = 
results[1] + + for index, text in enumerate(data): + labels = np_infers.reshape([-1]).astype( + np.int32).tolist()[args.max_seq_len * index:args.max_seq_len * + (index + 1)] + label_str = "" + count = 0 + for label_val in labels: + label_str += inv_label_map[label_val] + count += 1 + if count == (np_lens[index]): + break + + # Drop the label results of CLS and SEP Token + print("%s\tpredict=%s" % (text[0], label_str[1:-1])) diff --git a/demo/sequence-labeling/run_predict.sh b/demo/sequence-labeling/run_predict.sh index 7c5e37f1..2c433fd9 100644 --- a/demo/sequence-labeling/run_predict.sh +++ b/demo/sequence-labeling/run_predict.sh @@ -1,4 +1,4 @@ export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_sequence_label/best_model" +CKPT_DIR="./ckpt_sequence_label" python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True diff --git a/demo/sequence-labeling/sequence_label.py b/demo/sequence-labeling/sequence_label.py index 31a3444e..076df0c4 100644 --- a/demo/sequence-labeling/sequence_label.py +++ b/demo/sequence-labeling/sequence_label.py @@ -34,36 +34,28 @@ args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Step1: load Paddlehub ERNIE pretrained model + # Load Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Step2: Download dataset and use SequenceLabelReader to read dataset + # Download dataset and use SequenceLabelReader to read dataset dataset = hub.dataset.MSRA_NER() reader = hub.reader.SequenceLabelReader( dataset=dataset, vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) - # Step3: construct transfer learning network + # Construct transfer learning network # Use "sequence_output" for token-level output. 
sequence_output = outputs["sequence_output"] - # Define a sequence labeling finetune task by PaddleHub's API - seq_label_task = hub.create_seq_label_task( - feature=sequence_output, - max_seq_len=args.max_seq_len, - num_classes=dataset.num_labels) - # Setup feed list for data feeder # Must feed all the tensor of ERNIE's module need # Compared to classification task, we need add seq_len tensor to feedlist feed_list = [ inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name, - seq_label_task.variable('label').name, - seq_label_task.variable('seq_len').name + inputs["segment_ids"].name, inputs["input_mask"].name ] # Select a finetune strategy @@ -81,10 +73,15 @@ if __name__ == '__main__': checkpoint_dir=args.checkpoint_dir, strategy=strategy) - # Finetune and evaluate model by PaddleHub's API - # will finish training, evaluation, testing, save model automatically - hub.finetune_and_eval( - task=seq_label_task, + # Define a sequence labeling finetune task by PaddleHub's API + seq_label_task = hub.SequenceLabelTask( data_reader=reader, + feature=sequence_output, feed_list=feed_list, + max_seq_len=args.max_seq_len, + num_classes=dataset.num_labels, config=config) + + # Finetune and evaluate model by PaddleHub's API + # will finish training, evaluation, testing, save model automatically + seq_label_task.finetune_and_eval() diff --git a/demo/text-classification/predict.py b/demo/text-classification/predict.py index ff21c513..3586529f 100644 --- a/demo/text-classification/predict.py +++ b/demo/text-classification/predict.py @@ -31,6 +31,7 @@ import paddlehub as hub # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the 
longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") args = parser.parse_args() @@ -39,8 +40,7 @@ args = parser.parse_args() if __name__ == '__main__': # loading Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") - input_dict, output_dict, program = module.context( - max_seq_len=args.max_seq_len) + inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) # Sentence classification dataset reader dataset = hub.dataset.ChnSentiCorp() @@ -51,46 +51,53 @@ if __name__ == '__main__': place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - with fluid.program_guard(program): - # Use "pooled_output" for classification tasks on an entire sentence. - # Use "sequence_outputs" for token-level output. - pooled_output = output_dict["pooled_output"] - - # Define a classfication finetune task by PaddleHub's API - cls_task = hub.create_text_cls_task( - feature=pooled_output, num_classes=dataset.num_labels) - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - input_dict["input_ids"].name, input_dict["position_ids"].name, - input_dict["segment_ids"].name, input_dict["input_mask"].name, - cls_task.variable('label').name - ] + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. 
+ pooled_output = outputs["pooled_output"] - # classificatin probability tensor - probs = cls_task.variable("probs") + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] - pred = fluid.layers.argmax(probs, axis=1) + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_cuda=args.use_gpu, + batch_size=args.batch_size, + enable_memory_optim=False, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) - # load best model checkpoint - fluid.io.load_persistables(exe, args.checkpoint_dir) + # Define a classfication finetune task by PaddleHub's API + cls_task = hub.TextClassifierTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + num_classes=dataset.num_labels, + config=config) - inference_program = program.clone(for_test=True) + # Data to be prdicted + data = [ + ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], + [ + "还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。" + ], + [ + "前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦" + ], ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"] + ] - data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) - test_reader = reader.data_generator(phase='test', shuffle=False) - test_examples = dataset.get_test_examples() - total = 0 - correct = 0 - for index, batch in enumerate(test_reader()): - pred_v = exe.run( - feed=data_feeder.feed(batch), - fetch_list=[pred.name], - program=inference_program) - total += 1 - if (pred_v[0][0] == int(test_examples[index].label)): - correct += 1 - acc = 1.0 * correct / total - print("%s\tpredict=%s" % (test_examples[index], pred_v[0][0])) - print("accuracy = %f" % acc) + index = 0 + results = cls_task.predict(data=data) + for batch_result in 
results: + # get predict index + batch_result = np.argmax(batch_result, axis=2)[0] + for result in batch_result: + print("%s\tpredict=%s" % (data[index][0], result)) + index += 1 diff --git a/demo/text-classification/run_predict.sh b/demo/text-classification/run_predict.sh index 90a6ddfb..f70710a1 100644 --- a/demo/text-classification/run_predict.sh +++ b/demo/text-classification/run_predict.sh @@ -1,4 +1,4 @@ export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_chnsenticorp/best_model" +CKPT_DIR="./ckpt_chnsenticorp" python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False diff --git a/demo/text-classification/text_classifier.py b/demo/text-classification/text_classifier.py index 1411e86a..f44b4b75 100644 --- a/demo/text-classification/text_classifier.py +++ b/demo/text-classification/text_classifier.py @@ -36,13 +36,13 @@ args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Step1: load Paddlehub ERNIE pretrained model + # Load Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") # module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Step2: Download dataset and use ClassifyReader to read dataset + # Download dataset and use ClassifyReader to read dataset dataset = None if args.dataset.lower() == "chnsenticorp": dataset = hub.dataset.ChnSentiCorp() @@ -58,29 +58,25 @@ if __name__ == '__main__': vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) - # Step3: construct transfer learning network + # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
pooled_output = outputs["pooled_output"] - # Define a classfication finetune task by PaddleHub's API - cls_task = hub.create_text_cls_task( - feature=pooled_output, num_classes=dataset.num_labels) - # Setup feed list for data feeder # Must feed all the tensor of ERNIE's module need feed_list = [ - inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name, - cls_task.variable('label').name + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, ] - # Step4: Select finetune strategy, setup config and finetune + # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( weight_decay=args.weight_decay, learning_rate=args.learning_rate, - lr_scheduler="linear_decay", - ) + lr_scheduler="linear_decay") # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( @@ -90,7 +86,14 @@ if __name__ == '__main__': checkpoint_dir=args.checkpoint_dir, strategy=strategy) + # Define a classfication finetune task by PaddleHub's API + cls_task = hub.TextClassifierTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + num_classes=dataset.num_labels, + config=config) + # Finetune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically - hub.finetune_and_eval( - task=cls_task, data_reader=reader, feed_list=feed_list, config=config) + cls_task.finetune_and_eval() diff --git a/paddlehub/finetune/task.py b/paddlehub/finetune/task.py index 7d2a755a..e840fc6c 100644 --- a/paddlehub/finetune/task.py +++ b/paddlehub/finetune/task.py @@ -743,6 +743,14 @@ class SequenceLabelTask(BasicTask): name="cls_seq_label_out_b", initializer=fluid.initializer.Constant(0.))) + self.ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) + ret_infers = fluid.layers.assign(self.ret_infers) + + self.seq_len = fluid.layers.data( + name="seq_len", 
shape=[1], dtype='int64') + seq_len = fluid.layers.assign(self.seq_len) + logits = self.logits logits = fluid.layers.flatten(logits, axis=2) logits = fluid.layers.softmax(logits) @@ -761,13 +769,8 @@ class SequenceLabelTask(BasicTask): return loss def _add_metrics(self): - ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1]) - ret_infers = fluid.layers.reshape( - x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) - self.seq_len = fluid.layers.data( - name="seq_len", shape=[1], dtype='int64') - seq_len = fluid.layers.assign(self.seq_len) - return [ret_labels, ret_infers, seq_len] + self.ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1]) + return [self.ret_labels, self.ret_infers, self.seq_len] def _build_env_end_event(self): with self.log_writer.mode(self.phase) as logw: @@ -834,4 +837,14 @@ class SequenceLabelTask(BasicTask): feed_list = [varname for varname in self._base_feed_list] if self.is_train_phase or self.is_test_phase: feed_list += [self.label.name, self.seq_len.name] + else: + feed_list += [self.seq_len.name] return feed_list + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [metric.name for metric in self.metrics] + [self.loss.name] + elif self.is_predict_phase: + return [self.ret_infers.name] + [self.seq_len.name] + return [self.output.name] -- GitLab