diff --git a/demo/image-classification/README.md b/demo/image-classification/README.md index 1246f41a2533e1d0f92d6c78f24bd0b9ed849c78..64d3e0c13a5f9effc06440f96b0c7845d8fceac4 100644 --- a/demo/image-classification/README.md +++ b/demo/image-classification/README.md @@ -36,6 +36,8 @@ $ pip install --upgrade paddlepaddle --checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt --dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat/stanforddogs/indoor67/food101}。默认为flowers --use_gpu: 是否使用GPU进行训练,如果机器支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭 +--use_data_parallel: 是否使用数据并行,打开该开关时,会将数据分散到不同的卡上进行训练(CPU下会分布到不同线程)。默认关闭 +--use_pyreader: 是否使用pyreader进行数据喂入。默认关闭 ``` ## 进行预测 @@ -51,6 +53,7 @@ $ pip install --upgrade paddlepaddle --checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt --dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat}。默认为flowers --use_gpu: 使用使用GPU进行训练,如果本机支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭 +--use_pyreader: 是否使用pyreader进行数据喂入。默认关闭 ``` `注意`:进行预测时,所选择的module,checkpoint_dir,dataset必须和finetune所用的一样 diff --git a/demo/image-classification/img_classifier.py b/demo/image-classification/img_classifier.py index 243c47dc46e49df31736b884f6ab69d7ac0660f3..5083c972d793a0f2b7cf340c39cb25a68b74b130 100644 --- a/demo/image-classification/img_classifier.py +++ b/demo/image-classification/img_classifier.py @@ -1,6 +1,7 @@ #coding:utf-8 import argparse import os +import ast import paddle.fluid as fluid import paddlehub as hub @@ -8,12 +9,14 @@ import numpy as np # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") -parser.add_argument("--use_gpu", type=bool, default=True, help="Whether use GPU for fine-tuning.") -parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") -parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") -parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.") -parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") +parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for fine-tuning.") +parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") +parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") +parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.") +parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") # yapf: enable. module_map = { @@ -56,6 +59,8 @@ def finetune(args): feed_list = [img.name] config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, diff --git a/demo/image-classification/predict.py b/demo/image-classification/predict.py index bedc38618fd858fc3e986baf5858c5220ed60b15..532d813f8673781b14c1501f983723fc5bcbaa9f 100644 --- a/demo/image-classification/predict.py +++ b/demo/image-classification/predict.py @@ -1,6 +1,7 @@ #coding:utf-8 import argparse import os +import ast import paddle.fluid as fluid import paddlehub as hub @@ -8,11 +9,12 @@ import numpy as np # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--use_gpu", type=bool, default=False, help="Whether use GPU for predict.") -parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") -parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") -parser.add_argument("--module", type=str, default="resnet50", help="Module used as a feature extractor.") -parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for predict.") +parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") +parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") +parser.add_argument("--module", type=str, default="resnet50", help="Module used as a feature extractor.") +parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") # yapf: enable. module_map = { @@ -56,6 +58,7 @@ def predict(args): config = hub.RunConfig( use_data_parallel=False, + use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, enable_memory_optim=False, diff --git a/demo/lac/lac_demo.py b/demo/lac/lac_demo.py index d923d5ecb8786312ea107eba61820aa77a3c022b..aeb311154e1473f7a9e49d4f4df9e984b8a0c731 100644 --- a/demo/lac/lac_demo.py +++ b/demo/lac/lac_demo.py @@ -19,10 +19,10 @@ if __name__ == "__main__": results = lac.lexical_analysis(data=inputs) for result in results: if six.PY2: - print(json.dumps( - result['word'], encoding="utf8", ensure_ascii=False)) - print(json.dumps( - result['tag'], encoding="utf8", ensure_ascii=False)) + print( + json.dumps(result['word'], encoding="utf8", ensure_ascii=False)) + print( + json.dumps(result['tag'], encoding="utf8", ensure_ascii=False)) else: print(result['word']) print(result['tag']) diff --git a/demo/senta/senta_demo.py b/demo/senta/senta_demo.py index 07446a86d1d02d77c41ae76434a8574ef1ffd7ca..328517d306069728dd523343594f6930f2aa69ef 100644 --- a/demo/senta/senta_demo.py +++ b/demo/senta/senta_demo.py @@ -21,7 +21,7 @@ if __name__ == "__main__": results[index]["text"] = text for index, result in enumerate(results): if six.PY2: - print(json.dumps( - results[index], encoding="utf8", ensure_ascii=False)) + print( + json.dumps(results[index], encoding="utf8", ensure_ascii=False)) else: print(results[index]) diff --git a/demo/sequence-labeling/predict.py b/demo/sequence-labeling/predict.py index 0c3cf5138ad2e205543749b70ff96d2dd8563cff..fb26b86a3035f2ec61f79e83a50f4bc14354f7e3 100644 --- a/demo/sequence-labeling/predict.py +++ b/demo/sequence-labeling/predict.py @@ -33,15 +33,16 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 parser = argparse.ArgumentParser(__doc__) parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': # loading Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") - input_dict, output_dict, program = module.context( - max_seq_len=args.max_seq_len) + inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) # Sentence labeling dataset reader dataset = hub.dataset.MSRA_NER() @@ -53,70 +54,67 @@ if __name__ == '__main__': place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - with fluid.program_guard(program): - # Use "sequence_outputs" for token-level output. - sequence_output = output_dict["sequence_output"] - - # Define a classfication finetune task by PaddleHub's API - seq_label_task = hub.create_seq_label_task( - feature=sequence_output, - num_classes=dataset.num_labels, - max_seq_len=args.max_seq_len) - - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - # Compared to classification task, we need add seq_len tensor to feedlist - feed_list = [ - input_dict["input_ids"].name, input_dict["position_ids"].name, - input_dict["segment_ids"].name, input_dict["input_mask"].name, - seq_label_task.variable('label').name, - seq_label_task.variable('seq_len').name - ] - - fetch_list = [ - seq_label_task.variable("labels").name, - seq_label_task.variable("infers").name, - seq_label_task.variable("seq_len").name - ] - - # classification probability tensor - probs = seq_label_task.variable("probs") - - # load best model checkpoint - fluid.io.load_persistables(exe, args.checkpoint_dir) - - inference_program = program.clone(for_test=True) - - # calculate the num of label from probs variable shape - num_labels = seq_label_task.variable("probs").shape[1] - - data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) - test_reader = reader.data_generator(phase='test', shuffle=False) - test_examples = dataset.get_test_examples() - total_label, total_infer, total_correct = 0.0, 0.0, 0.0 - for index, batch in enumerate(test_reader()): - np_labels, np_infers, np_lens = exe.run( - feed=data_feeder.feed(batch), - fetch_list=fetch_list, - program=inference_program) - label_num, infer_num, correct_num = chunk_eval( - np_labels, np_infers, np_lens, num_labels) - - total_infer += infer_num - total_label += label_num - total_correct += correct_num - - labels = np_labels.reshape([-1]).astype(np.int32).tolist() + + # Construct transfer learning network + # Use "sequence_output" for token-level output. + sequence_output = outputs["sequence_output"] + + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=False, + use_pyreader=args.use_pyreader, + use_cuda=args.use_gpu, + batch_size=args.batch_size, + enable_memory_optim=False, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) + + # Define a sequence labeling finetune task by PaddleHub's API + seq_label_task = hub.SequenceLabelTask( + data_reader=reader, + feature=sequence_output, + feed_list=feed_list, + max_seq_len=args.max_seq_len, + num_classes=dataset.num_labels, + config=config) + + # test data + data = [ + ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], + ["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], + ["其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"], + ["有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"], + ["不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"], + ] + + results = seq_label_task.predict(data=data) + + for num_batch, batch_results in enumerate(results): + infers = batch_results[0].reshape([-1]).astype(np.int32).tolist() + np_lens = batch_results[1] + + for index, np_len in enumerate(np_lens): + labels = infers[index * args.max_seq_len:(index + 1) * + args.max_seq_len] + label_str = "" count = 0 for label_val in labels: label_str += inv_label_map[label_val] count += 1 - if count == np_lens: + if count == np_len: break - print("%s\tpredict=%s" % (test_examples[index], label_str)) - - precision, recall, f1 = calculate_f1(total_label, total_infer, - total_correct) - print("F1-Score=%f, precision=%f, recall=%f " % (f1, precision, recall)) + # Drop the label results of CLS and SEP Token + print( + "%s\tpredict=%s" % + (data[num_batch * args.batch_size + index][0], label_str[1:-1])) diff --git a/demo/sequence-labeling/run_predict.sh b/demo/sequence-labeling/run_predict.sh index 7c5e37f1b21ab5b30cb9082a654428fe4e1f7094..2c433fd97a5f9ef3b85893fc38a5d74e9d25cc80 100644 --- a/demo/sequence-labeling/run_predict.sh +++ b/demo/sequence-labeling/run_predict.sh @@ -1,4 +1,4 @@ export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_sequence_label/best_model" +CKPT_DIR="./ckpt_sequence_label" python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True diff --git a/demo/sequence-labeling/run_sequence_label.sh b/demo/sequence-labeling/run_sequence_label.sh index 55ca0bd3361ba1fed725ef00feb0f9efaef6b5ca..84c29b8241540833d3119b774ce8c288f940d1de 100644 --- a/demo/sequence-labeling/run_sequence_label.sh +++ b/demo/sequence-labeling/run_sequence_label.sh @@ -7,4 +7,6 @@ python -u sequence_label.py \ --num_epoch 3 \ --checkpoint_dir $CKPT_DIR \ --max_seq_len 256 \ - --learning_rate 5e-5 + --learning_rate 5e-5 \ + --use_pyreader True \ + --use_data_parallel True diff --git a/demo/sequence-labeling/sequence_label.py b/demo/sequence-labeling/sequence_label.py index 31a3444eea446944a8e5820603b68fab099cfe14..6544f99b34b4e14bf06e224079cce5bcdde6a127 100644 --- a/demo/sequence-labeling/sequence_label.py +++ b/demo/sequence-labeling/sequence_label.py @@ -30,40 +30,34 @@ parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Step1: load Paddlehub ERNIE pretrained model + # Load Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Step2: Download dataset and use SequenceLabelReader to read dataset + # Download dataset and use SequenceLabelReader to read dataset dataset = hub.dataset.MSRA_NER() reader = hub.reader.SequenceLabelReader( dataset=dataset, vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) - # Step3: construct transfer learning network + # Construct transfer learning network # Use "sequence_output" for token-level output. sequence_output = outputs["sequence_output"] - # Define a sequence labeling finetune task by PaddleHub's API - seq_label_task = hub.create_seq_label_task( - feature=sequence_output, - max_seq_len=args.max_seq_len, - num_classes=dataset.num_labels) - # Setup feed list for data feeder # Must feed all the tensor of ERNIE's module need # Compared to classification task, we need add seq_len tensor to feedlist feed_list = [ inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name, - seq_label_task.variable('label').name, - seq_label_task.variable('seq_len').name + inputs["segment_ids"].name, inputs["input_mask"].name ] # Select a finetune strategy @@ -75,16 +69,23 @@ if __name__ == '__main__': # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=strategy) - # Finetune and evaluate model by PaddleHub's API - # will finish training, evaluation, testing, save model automatically - hub.finetune_and_eval( - task=seq_label_task, + # Define a sequence labeling finetune task by PaddleHub's API + seq_label_task = hub.SequenceLabelTask( data_reader=reader, + feature=sequence_output, feed_list=feed_list, + max_seq_len=args.max_seq_len, + num_classes=dataset.num_labels, config=config) + + # Finetune and evaluate model by PaddleHub's API + # will finish training, evaluation, testing, save model automatically + seq_label_task.finetune_and_eval() diff --git a/demo/text-classification/predict.py b/demo/text-classification/predict.py index ff21c51310907a0e7bb1cefbf277d561c2bd616c..1465d0d62077d924086eb55fd2ed8b5a1f02ca64 100644 --- a/demo/text-classification/predict.py +++ b/demo/text-classification/predict.py @@ -31,16 +31,17 @@ import paddlehub as hub # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': # loading Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") - input_dict, output_dict, program = module.context( - max_seq_len=args.max_seq_len) + inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) # Sentence classification dataset reader dataset = hub.dataset.ChnSentiCorp() @@ -51,46 +52,55 @@ if __name__ == '__main__': place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - with fluid.program_guard(program): - # Use "pooled_output" for classification tasks on an entire sentence. - # Use "sequence_outputs" for token-level output. - pooled_output = output_dict["pooled_output"] - - # Define a classfication finetune task by PaddleHub's API - cls_task = hub.create_text_cls_task( - feature=pooled_output, num_classes=dataset.num_labels) - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - input_dict["input_ids"].name, input_dict["position_ids"].name, - input_dict["segment_ids"].name, input_dict["input_mask"].name, - cls_task.variable('label').name - ] + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. + pooled_output = outputs["pooled_output"] - # classificatin probability tensor - probs = cls_task.variable("probs") + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] - pred = fluid.layers.argmax(probs, axis=1) + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=False, + use_pyreader=args.use_pyreader, + use_cuda=args.use_gpu, + batch_size=args.batch_size, + enable_memory_optim=False, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) - # load best model checkpoint - fluid.io.load_persistables(exe, args.checkpoint_dir) + # Define a classfication finetune task by PaddleHub's API + cls_task = hub.TextClassifierTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + num_classes=dataset.num_labels, + config=config) - inference_program = program.clone(for_test=True) + # Data to be prdicted + data = [ + ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], + [ + "还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。" + ], + [ + "前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦" + ], ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"] + ] - data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) - test_reader = reader.data_generator(phase='test', shuffle=False) - test_examples = dataset.get_test_examples() - total = 0 - correct = 0 - for index, batch in enumerate(test_reader()): - pred_v = exe.run( - feed=data_feeder.feed(batch), - fetch_list=[pred.name], - program=inference_program) - total += 1 - if (pred_v[0][0] == int(test_examples[index].label)): - correct += 1 - acc = 1.0 * correct / total - print("%s\tpredict=%s" % (test_examples[index], pred_v[0][0])) - print("accuracy = %f" % acc) + index = 0 + results = cls_task.predict(data=data) + for batch_result in results: + # get predict index + batch_result = np.argmax(batch_result, axis=2)[0] + for result in batch_result: + print("%s\tpredict=%s" % (data[index][0], result)) + index += 1 diff --git a/demo/text-classification/run_classifier.sh b/demo/text-classification/run_classifier.sh index e29cca883c8f3ee05538ba1a22a57819f6bb5727..65f000d732121d0df8fc760210e6fb0c14ef02c4 100644 --- a/demo/text-classification/run_classifier.sh +++ b/demo/text-classification/run_classifier.sh @@ -1,4 +1,4 @@ -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=0 # User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task DATASET="chnsenticorp" @@ -16,4 +16,6 @@ python -u text_classifier.py \ --learning_rate=5e-5 \ --weight_decay=0.01 \ --max_seq_len=128 \ - --num_epoch=3 + --num_epoch=3 \ + --use_pyreader=True \ + --use_data_parallel=True \ diff --git a/demo/text-classification/run_predict.sh b/demo/text-classification/run_predict.sh index 90a6ddfb8d7dfd439ef76982714425924a4dcd33..f70710a1208ac7102cd44d077d9e7195a091e422 100644 --- a/demo/text-classification/run_predict.sh +++ b/demo/text-classification/run_predict.sh @@ -1,4 +1,4 @@ export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_chnsenticorp/best_model" +CKPT_DIR="./ckpt_chnsenticorp" python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False diff --git a/demo/text-classification/text_classifier.py b/demo/text-classification/text_classifier.py index 1411e86a82474f28bf8e2ad7e126fa786097f4c5..ca929adb16fa9eca93076dc81d716c8eb5084664 100644 --- a/demo/text-classification/text_classifier.py +++ b/demo/text-classification/text_classifier.py @@ -32,17 +32,19 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Step1: load Paddlehub ERNIE pretrained model + # Load Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") # module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Step2: Download dataset and use ClassifyReader to read dataset + # Download dataset and use ClassifyReader to read dataset dataset = None if args.dataset.lower() == "chnsenticorp": dataset = hub.dataset.ChnSentiCorp() @@ -58,39 +60,44 @@ if __name__ == '__main__': vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) - # Step3: construct transfer learning network + # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. pooled_output = outputs["pooled_output"] - # Define a classfication finetune task by PaddleHub's API - cls_task = hub.create_text_cls_task( - feature=pooled_output, num_classes=dataset.num_labels) - # Setup feed list for data feeder # Must feed all the tensor of ERNIE's module need feed_list = [ - inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name, - cls_task.variable('label').name + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, ] - # Step4: Select finetune strategy, setup config and finetune + # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( weight_decay=args.weight_decay, learning_rate=args.learning_rate, - lr_scheduler="linear_decay", - ) + lr_scheduler="linear_decay") # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=strategy) + # Define a classfication finetune task by PaddleHub's API + cls_task = hub.TextClassifierTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + num_classes=dataset.num_labels, + config=config) + # Finetune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically - hub.finetune_and_eval( - task=cls_task, data_reader=reader, feed_list=feed_list, config=config) + cls_task.finetune_and_eval() diff --git a/paddlehub/commands/clear.py b/paddlehub/commands/clear.py index 0b5624c357f7b67bb7ce656b264c65502d99fc17..0888911213921b6e8bbe23ec0d8470f0191902bb 100644 --- a/paddlehub/commands/clear.py +++ b/paddlehub/commands/clear.py @@ -50,7 +50,7 @@ class ClearCommand(BaseCommand): def __init__(self, name): super(ClearCommand, self).__init__(name) self.show_in_help = True - self.description = "Clear all cache data." + self.description = "Clear all cached data." def cache_dir(self): return CACHE_HOME diff --git a/paddlehub/finetune/task.py b/paddlehub/finetune/task.py index 7d2a755a9b2462434dbd3443c05c01937f7ae38c..0fe9b9c31fb9743dd03602c569c163eccb29741f 100644 --- a/paddlehub/finetune/task.py +++ b/paddlehub/finetune/task.py @@ -110,8 +110,17 @@ class BasicTask(object): # run config self.config = config if config else RunConfig() - self.place, self.device_count = hub.common.get_running_device_info( - self.config) + self.place = self.places[0] + self.device_count = len(self.places) + + if self.config.batch_size < self.device_count: + logger.warning( + "Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions" + .format(self.config.batch_size, self.device_count)) + logger.warning("Batch size automatically adjusted to {}".format( + self.device_count)) + self.config._batch_size = self.device_count + self.exe = fluid.Executor(place=self.place) self.build_strategy = fluid.BuildStrategy() if self.config.enable_memory_optim: @@ -239,6 +248,12 @@ class BasicTask(object): self.exe.run(self.env.startup_program) self._build_env_end_event() + @property + def places(self): + if self.config.use_cuda: + return fluid.framework.cuda_places() + return fluid.framework.cpu_places() + @property def is_train_phase(self): return self.phase in ["train"] @@ -481,6 +496,9 @@ class BasicTask(object): period_run_states = [] for run_step, batch in enumerate(self.reader(), start=1): + if self.config.use_data_parallel and len(batch) < self.device_count: + continue + step_run_state = RunState(len(self.fetch_list)) step_run_state.run_step = 1 num_batch_examples = len(batch) @@ -554,10 +572,10 @@ class BasicTask(object): class ClassifierTask(BasicTask): def __init__(self, - data_reader, feature, num_classes, feed_list, + data_reader, startup_program=None, config=None, hidden_units=None): @@ -662,10 +680,10 @@ ImageClassifierTask = ClassifierTask class TextClassifierTask(ClassifierTask): def __init__(self, - data_reader, feature, num_classes, feed_list, + data_reader, startup_program=None, config=None, hidden_units=None): @@ -711,8 +729,8 @@ class SequenceLabelTask(BasicTask): feature, max_seq_len, num_classes, - data_reader, feed_list, + data_reader, startup_program=None, config=None, ): @@ -743,6 +761,14 @@ class SequenceLabelTask(BasicTask): name="cls_seq_label_out_b", initializer=fluid.initializer.Constant(0.))) + self.ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) + ret_infers = fluid.layers.assign(self.ret_infers) + + self.seq_len = fluid.layers.data( + name="seq_len", shape=[1], dtype='int64') + seq_len = fluid.layers.assign(self.seq_len) + logits = self.logits logits = fluid.layers.flatten(logits, axis=2) logits = fluid.layers.softmax(logits) @@ -761,13 +787,8 @@ class SequenceLabelTask(BasicTask): return loss def _add_metrics(self): - ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1]) - ret_infers = fluid.layers.reshape( - x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) - self.seq_len = fluid.layers.data( - name="seq_len", shape=[1], dtype='int64') - seq_len = fluid.layers.assign(self.seq_len) - return [ret_labels, ret_infers, seq_len] + self.ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1]) + return [self.ret_labels, self.ret_infers, self.seq_len] def _build_env_end_event(self): with self.log_writer.mode(self.phase) as logw: @@ -834,4 +855,14 @@ class SequenceLabelTask(BasicTask): feed_list = [varname for varname in self._base_feed_list] if self.is_train_phase or self.is_test_phase: feed_list += [self.label.name, self.seq_len.name] + else: + feed_list += [self.seq_len.name] return feed_list + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [metric.name for metric in self.metrics] + [self.loss.name] + elif self.is_predict_phase: + return [self.ret_infers.name] + [self.seq_len.name] + return [self.output.name] diff --git a/paddlehub/module/module.py b/paddlehub/module/module.py index df6211bf6795be5961898637513d1c203fd3c383..b47099acc2ce24e766b82a642e81de56c24e9a18 100644 --- a/paddlehub/module/module.py +++ b/paddlehub/module/module.py @@ -463,13 +463,22 @@ class Module(object): with fluid.program_guard(program): result = [] index = 0 - place = fluid.CPUPlace() + if "PADDLEHUB_CUDA_ENABLE" in os.environ: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + + if "PADDLEHUB_BATCH_SIZE" in os.environ: + batch_size = os.environ["PADDLEHUB_BATCH_SIZE"] + else: + batch_size = 1 + exe = fluid.Executor(place=place) data = self.processor.preprocess( sign_name=sign_name, data_dict=data) data_format = self.processor.data_format(sign_name=sign_name) reader, feeder = _get_reader_and_feeder(data_format, data, place) - reader = paddle.batch(reader, batch_size=2) + reader = paddle.batch(reader, batch_size=batch_size) for batch in reader(): data_out = exe.run( feed=feeder.feed(batch),