From 06ec43f57b7e24ac103438c7a511c5b354ecfe51 Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Mon, 23 Dec 2019 17:46:47 +0800 Subject: [PATCH] Simpliy Demo (#273) * update demo * simplify img cls demo * remove elmo demo * simplify multi-label cls demo * simplify multi-label cls demo * simplify img cls demo * simplify text cls demo * simplify lac demo * simplify ssd demo * simplify qa cls demo * simplify regression demo * simplify reading comprehension demo * simplify sequence labeling demo * simplify senta demo --- demo/elmo/README.md | 150 --------------- demo/elmo/elmo_finetune.py | 167 ---------------- demo/elmo/predict.py | 179 ------------------ demo/elmo/run_elmo_finetune.sh | 10 - demo/elmo/run_predict.sh | 5 - demo/image-classification/img_classifier.py | 14 +- demo/image-classification/predict.py | 14 +- demo/lac/cli_demo.sh | 1 - demo/lac/test/test.txt | 3 - demo/lac/test/test.yml | 4 - .../multi_label_classifier.py | 20 +- demo/multi-label-classification/predict.py | 24 +-- .../run_classifier.sh | 9 +- demo/qa_classification/classifier.py | 8 +- demo/qa_classification/predict.py | 10 +- demo/qa_classification/run_classifier.sh | 8 +- demo/reading-comprehension/predict.py | 53 +----- .../reading_comprehension.py | 39 +--- demo/reading-comprehension/run_finetune.sh | 13 +- demo/reading-comprehension/run_predict.sh | 14 +- demo/regression/predict.py | 42 ++-- demo/regression/regression.py | 23 +-- demo/regression/run_predict.sh | 10 +- demo/regression/run_regssion.sh | 10 +- demo/senta/cli_demo.sh | 1 - demo/senta/predict.py | 29 ++- demo/senta/run_finetune.sh | 5 +- demo/senta/run_predict.sh | 4 +- demo/senta/senta_demo.py | 3 +- demo/senta/senta_finetune.py | 14 +- demo/sequence-labeling/predict.py | 9 +- demo/sequence-labeling/run_sequence_label.sh | 2 +- demo/sequence-labeling/sequence_label.py | 16 +- demo/ssd/cli_demo.sh | 1 - demo/text-classification/predict.py | 110 ++--------- demo/text-classification/run_classifier.sh | 9 +- demo/text-classification/run_predict.sh | 15 +- demo/text-classification/text_classifier.py | 120 ++---------- 38 files changed, 177 insertions(+), 991 deletions(-) delete mode 100644 demo/elmo/README.md delete mode 100644 demo/elmo/elmo_finetune.py delete mode 100644 demo/elmo/predict.py delete mode 100644 demo/elmo/run_elmo_finetune.sh delete mode 100644 demo/elmo/run_predict.sh delete mode 100644 demo/lac/cli_demo.sh delete mode 100644 demo/lac/test/test.txt delete mode 100644 demo/lac/test/test.yml delete mode 100644 demo/senta/cli_demo.sh delete mode 100644 demo/ssd/cli_demo.sh diff --git a/demo/elmo/README.md b/demo/elmo/README.md deleted file mode 100644 index 5c655f01..00000000 --- a/demo/elmo/README.md +++ /dev/null @@ -1,150 +0,0 @@ -# PaddleHub 文本分类 - -本示例将展示如何使用PaddleHub Finetune API以及加载ELMo预训练中文word embedding在中文情感分析数据集ChnSentiCorp上完成分类任务。 - -## 如何开始Finetune - -在完成安装PaddlePaddle与PaddleHub后,通过执行脚本`sh run_elmo_finetune.sh`即可开始使用ELMo对ChnSentiCorp数据集进行Finetune。 - -其中脚本参数说明如下: - -```bash -# 模型相关 ---batch_size: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数use ---use_gpu: 是否使用GPU进行FineTune,默认为True ---learning_rate: Finetune的最大学习率 ---weight_decay: 控制正则项力度的参数,用于防止过拟合,默认为0.01 ---warmup_proportion: 学习率warmup策略的比例,如果0.1,则学习率会在前10%训练step的过程中从0慢慢增长到learning_rate, 而后再缓慢衰减,默认为0 ---num_epoch: Finetune迭代的轮数 - - -# 任务相关 ---checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型 -``` - -## 代码步骤 - -使用PaddleHub Finetune API进行Finetune可以分为4个步骤 - -### Step1: 加载预训练模型 - -```python -module = hub.Module(name="elmo") -inputs, outputs, 
program = module.context(trainable=True) -``` - -### Step2: 准备数据集并使用LACClassifyReader读取数据 -```python -dataset = hub.dataset.ChnSentiCorp() -reader = hub.reader.LACClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path()) -``` - -其中数据集的准备代码可以参考 [chnsenticorp.py](https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.2/paddlehub/dataset/chnsenticorp.py) - -`hub.dataset.ChnSentiCorp()` 会自动从网络下载数据集并解压到用户目录下`$HOME/.paddlehub/dataset`目录 - -`module.get_vaocab_path()` 会返回预训练模型对应的词表 - -LACClassifyReader中的`data_generator`会自动按照模型对应词表对数据进行切词,以迭代器的方式返回ELMo所需要的Tensor格式,包括`word_ids`. - -### Step3:选择优化策略和运行配置 - -```python -strategy = hub.AdamWeightDecayStrategy( - learning_rate=5e-5, - weight_decay=0.01, - warmup_proportion=0.0, - lr_scheduler="linear_decay", -) - -config = hub.RunConfig(use_cuda=True, use_data_parallel=True, use_pyreader=False, num_epoch=3, batch_size=32, strategy=strategy) -``` - -#### 优化策略 -针对ERNIE与BERT类任务,PaddleHub封装了适合这一任务的迁移学习优化策略`AdamWeightDecayStrategy` - -* `learning_rate`: Finetune过程中的最大学习率; -* `weight_decay`: 模型的正则项参数,默认0.01,如果模型有过拟合倾向,可适当调高这一参数; -* `warmup_proportion`: 如果warmup_proportion>0, 例如0.1, 则学习率会在前10%的steps中线性增长至最高值learning_rate; -* `lr_scheduler`: 有两种策略可选(1) `linear_decay`策略学习率会在最高点后以线性方式衰减; `noam_decay`策略学习率会在最高点以多项式形式衰减; - -#### 运行配置 -`RunConfig` 主要控制Finetune的训练,包含以下可控制的参数: - -* `log_interval`: 进度日志打印间隔,默认每10个step打印一次 -* `eval_interval`: 模型评估的间隔,默认每100个step评估一次验证集 -* `save_ckpt_interval`: 模型保存间隔,请根据任务大小配置,默认只保存验证集效果最好的模型和训练结束的模型 -* `use_cuda`: 是否使用GPU训练,默认为False -* `use_data_parallel`: 是否使用并行计算,默认False。打开该功能依赖nccl库 -* `use_pyreader`: 是否使用pyreader,默认False -* `checkpoint_dir`: 模型checkpoint保存路径, 若用户没有指定,程序会自动生成 -* `num_epoch`: finetune的轮数 -* `batch_size`: 训练的批大小,如果使用GPU,请根据实际情况调整batch_size -* `enable_memory_optim`: 是否使用内存优化, 默认为True -* `strategy`: Finetune优化策略 - -**Note**: 当使用LACClassifyReader时,use_pyreader应该为False。 - -### Step4: 构建网络并创建分类迁移任务进行Finetune - -有了合适的预训练模型和准备要迁移的数据集后,我们开始组建一个Task。 ->* 获取module的上下文环境,包括输入和输出的变量,以及Paddle Program; ->* 从输出变量中找到输入单词对应的elmo_embedding, 并拼接上随机初始化word embedding; ->* 在拼接embedding输入gru网络,进行文本分类,生成Task; - -```python -word_ids = inputs["word_ids"] -elmo_embedding = outputs["elmo_embed"] - -feed_list = [word_ids.name] - -switch_main_program(program) - -word_embed_dims = 128 -word_embedding = fluid.layers.embedding( - input=word_ids, - size=[word_dict_len, word_embed_dims], - param_attr=fluid.ParamAttr( - learning_rate=30, - initializer=fluid.initializer.Uniform(low=-0.1, high=0.1))) - -input_feature = fluid.layers.concat( - input=[elmo_embedding, word_embedding], axis=1) - -fc = gru_net(program, input_feature) - -elmo_task = hub.TextClassifierTask( - data_reader=reader, - feature=fc, - feed_list=feed_list, - num_classes=dataset.num_labels, - config=config) - -elmo_task.finetune_and_eval() -``` -**NOTE:** -1. `outputs["elmo_embed"]`返回了ELMo模型预训练的word embedding。 -2. 
`hub.TextClassifierTask`通过输入特征,label与迁移的类别数,可以生成适用于文本分类的迁移任务`TextClassifierTask` - -## 可视化 - -Finetune API训练过程中会自动对关键训练指标进行打点,启动程序后执行下面命令 -```bash -$ tensorboard --logdir $CKPT_DIR/visualization --host ${HOST_IP} --port ${PORT_NUM} -``` -其中${HOST_IP}为本机IP地址,${PORT_NUM}为可用端口号,如本机IP地址为192.168.0.1,端口号8040,用浏览器打开192.168.0.1:8040,即可看到训练过程中指标的变化情况 - -## 模型预测 - -通过Finetune完成模型训练后,在对应的ckpt目录下,会自动保存验证集上效果最好的模型。 -配置脚本参数 -``` -CKPT_DIR="./ckpt_chnsentiment" -python predict.py --checkpoint_dir --use_gpu True -``` -其中CKPT_DIR为Finetune API保存最佳模型的路径 - -参数配置正确后,请执行脚本`sh run_predict.sh`,即可看到以下文本分类预测结果, 以及最终准确率。 -如需了解更多预测步骤,请参考`predict.py` diff --git a/demo/elmo/elmo_finetune.py b/demo/elmo/elmo_finetune.py deleted file mode 100644 index f39cd1ed..00000000 --- a/demo/elmo/elmo_finetune.py +++ /dev/null @@ -1,167 +0,0 @@ -#coding:utf-8 -import argparse -import ast -import io -import numpy as np - -from paddle.fluid.framework import switch_main_program -import paddle.fluid as fluid -import paddlehub as hub - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") -parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") -parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") -parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate used to train with warmup.") -parser.add_argument("--weight_decay", type=float, default=5, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.05, help="Warmup proportion params for warmup strategy") -args = parser.parse_args() -# yapf: enable. 
- - -def bow_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - bow = fluid.layers.sequence_pool(input=input_feature, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) - fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") - fc = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - - return fc - - -def cnn_net(program, input_feature, win_size=3, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - conv_3 = fluid.nets.sequence_conv_pool( - input=input_feature, - num_filters=hid_dim, - filter_size=win_size, - act="relu", - pool_type="max") - fc = fluid.layers.fc(input=conv_3, size=hid_dim2) - - return fc - - -def gru_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 3) - gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) - gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') - gru_max_tanh = fluid.layers.tanh(gru_max) - fc = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') - - return fc - - -def bilstm_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - rfc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - rlstm_h, c = fluid.layers.dynamic_lstm( - input=rfc0, size=hid_dim * 4, is_reverse=True) - - # extract last step - lstm_last = fluid.layers.sequence_last_step(input=lstm_h) - rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h) - - lstm_last_tanh = fluid.layers.tanh(lstm_last) - rlstm_last_tanh = fluid.layers.tanh(rlstm_last) - - # concat layer - lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1) - # full connect layer - fc = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh') - - return fc - - -def lstm_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) - fc = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') - - return fc - - -if __name__ == '__main__': - # Step1: load Paddlehub elmo pretrained model - module = hub.Module(name="elmo") - inputs, outputs, program = module.context(trainable=True) - - # Step2: Download dataset and use LACClassifyReade to read dataset - dataset = hub.dataset.ChnSentiCorp() - - reader = hub.reader.LACClassifyReader( - dataset=dataset, vocab_path=module.get_vocab_path()) - word_dict_len = len(reader.vocab) - - word_ids = inputs["word_ids"] - elmo_embedding = outputs["elmo_embed"] - - # Step3: switch program and build network - # Choose the net which you would like: bow, cnn, gru, bilstm, lstm - switch_main_program(program) - - # Embedding layer - word_embed_dims = 128 - word_embedding = fluid.layers.embedding( - input=word_ids, - size=[word_dict_len, word_embed_dims], - param_attr=fluid.ParamAttr( - learning_rate=30, - initializer=fluid.initializer.Uniform(low=-0.1, high=0.1))) - - # Add elmo embedding - input_feature = fluid.layers.concat( - input=[elmo_embedding, word_embedding], axis=1) - - # Choose the net which you would like: bow, cnn, gru, bilstm, lstm - # We recommend you to choose the 
gru_net - fc = gru_net(program, input_feature) - - # Setup feed list for data feeder - # Must feed all the tensor of senta's module need - feed_list = [word_ids.name] - - # Step4: Select finetune strategy, setup config and finetune - strategy = hub.AdamWeightDecayStrategy( - weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay", - warmup_proportion=args.warmup_proportion) - - # Step5: Setup runing config for PaddleHub Finetune API - config = hub.RunConfig( - use_cuda=args.use_gpu, - use_data_parallel=True, - use_pyreader=False, - num_epoch=args.num_epoch, - batch_size=args.batch_size, - checkpoint_dir=args.checkpoint_dir, - strategy=strategy) - - # Step6: Define a classfication finetune task by PaddleHub's API - elmo_task = hub.TextClassifierTask( - data_reader=reader, - feature=fc, - feed_list=feed_list, - num_classes=dataset.num_labels, - config=config) - - # Finetune and evaluate by PaddleHub's API - # will finish training, evaluation, testing, save model automatically - elmo_task.finetune_and_eval() diff --git a/demo/elmo/predict.py b/demo/elmo/predict.py deleted file mode 100644 index c37e4aac..00000000 --- a/demo/elmo/predict.py +++ /dev/null @@ -1,179 +0,0 @@ -#coding:utf-8 -import argparse -import ast -import io -import numpy as np - -from paddle.fluid.framework import switch_main_program -import paddle.fluid as fluid -import paddlehub as hub - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") -parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") -parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate used to train with warmup.") -parser.add_argument("--weight_decay", type=float, default=5, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.05, help="Warmup proportion params for warmup strategy") -args = parser.parse_args() -# yapf: enable. 
- - -def bow_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - bow = fluid.layers.sequence_pool(input=input_feature, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) - fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") - fc = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - - return fc - - -def cnn_net(program, input_feature, win_size=3, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - conv_3 = fluid.nets.sequence_conv_pool( - input=input_feature, - num_filters=hid_dim, - filter_size=win_size, - act="relu", - pool_type="max") - fc = fluid.layers.fc(input=conv_3, size=hid_dim2) - - return fc - - -def gru_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 3) - gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) - gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') - gru_max_tanh = fluid.layers.tanh(gru_max) - fc = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') - - return fc - - -def bilstm_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - rfc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - rlstm_h, c = fluid.layers.dynamic_lstm( - input=rfc0, size=hid_dim * 4, is_reverse=True) - - # extract last step - lstm_last = fluid.layers.sequence_last_step(input=lstm_h) - rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h) - - lstm_last_tanh = fluid.layers.tanh(lstm_last) - rlstm_last_tanh = fluid.layers.tanh(rlstm_last) - - # concat layer - lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1) - # full connect layer - fc = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh') - - return fc - - -def lstm_net(program, input_feature, hid_dim=128, hid_dim2=96): - switch_main_program(program) - - fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4) - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) - fc = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') - - return fc - - -if __name__ == '__main__': - # Step1: load Paddlehub elmo pretrained model - module = hub.Module(name="elmo") - inputs, outputs, program = module.context(trainable=True) - - # Step2: Download dataset and use LACClassifyReade to read dataset - dataset = hub.dataset.ChnSentiCorp() - - reader = hub.reader.LACClassifyReader( - dataset=dataset, vocab_path=module.get_vocab_path()) - word_dict_len = len(reader.vocab) - - word_ids = inputs["word_ids"] - elmo_embedding = outputs["elmo_embed"] - - # Step3: switch program and build network - # Choose the net which you would like: bow, cnn, gru, bilstm, lstm - switch_main_program(program) - - # Embedding layer - word_embed_dims = 128 - word_embedding = fluid.layers.embedding( - input=word_ids, - size=[word_dict_len, word_embed_dims], - param_attr=fluid.ParamAttr( - learning_rate=30, - initializer=fluid.initializer.Uniform(low=-0.1, high=0.1))) - - # Add elmo embedding - input_feature = fluid.layers.concat( - input=[elmo_embedding, word_embedding], axis=1) - - # Choose the net which you would like: bow, cnn, gru, bilstm, lstm - # We recommend you to choose the 
gru_net - fc = gru_net(program, input_feature) - - # Setup feed list for data feeder - # Must feed all the tensor of senta's module need - feed_list = [word_ids.name] - - # Step4: Select finetune strategy, setup config and finetune - strategy = hub.AdamWeightDecayStrategy( - weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay", - warmup_proportion=args.warmup_proportion) - - # Step5: Setup runing config for PaddleHub Finetune API - config = hub.RunConfig( - use_cuda=args.use_gpu, - use_data_parallel=True, - use_pyreader=False, - batch_size=args.batch_size, - checkpoint_dir=args.checkpoint_dir, - strategy=strategy) - - # Step6: Define a classfication finetune task by PaddleHub's API - elmo_task = hub.TextClassifierTask( - data_reader=reader, - feature=fc, - feed_list=feed_list, - num_classes=dataset.num_labels, - config=config) - - # Data to be prdicted - data = [ - "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小", - "还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。", - "前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦", - "19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~" - ] - - index = 0 - run_states = elmo_task.predict(data=data) - results = [run_state.run_results for run_state in run_states] - for batch_result in results: - # get predict index - batch_result = np.argmax(batch_result, axis=2)[0] - for result in batch_result: - print("%s\tpredict=%s" % (data[index], result)) - index += 1 diff --git a/demo/elmo/run_elmo_finetune.sh b/demo/elmo/run_elmo_finetune.sh deleted file mode 100644 index b9d76c64..00000000 --- a/demo/elmo/run_elmo_finetune.sh +++ /dev/null @@ -1,10 +0,0 @@ -export FLAGS_eager_delete_tensor_gb=0.0 -export CUDA_VISIBLE_DEVICES=0 - -python -u elmo_finetune.py \ - --batch_size=32 \ - --use_gpu=True \ - --checkpoint_dir="./ckpt_chnsenticorp" \ - --learning_rate=1e-4 \ - --weight_decay=1 \ - --num_epoch=3 diff --git a/demo/elmo/run_predict.sh b/demo/elmo/run_predict.sh deleted file mode 100644 index cfe01d6b..00000000 --- a/demo/elmo/run_predict.sh +++ /dev/null @@ -1,5 +0,0 @@ -export FLAGS_eager_delete_tensor_gb=0.0 -export CUDA_VISIBLE_DEVICES=0 - -CKPT_DIR="./ckpt_chnsenticorp" -python -u predict.py --checkpoint_dir $CKPT_DIR --use_gpu True diff --git a/demo/image-classification/img_classifier.py b/demo/image-classification/img_classifier.py index 27acea75..40e170a5 100644 --- a/demo/image-classification/img_classifier.py +++ b/demo/image-classification/img_classifier.py @@ -15,7 +15,6 @@ parser.add_argument("--checkpoint_dir", type=str, default="pad parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.") parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.") parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") # yapf: enable. 
@@ -30,9 +29,11 @@ module_map = { def finetune(args): + # Load Paddlehub pretrained model module = hub.Module(name=args.module) input_dict, output_dict, program = module.context(trainable=True) + # Download dataset if args.dataset.lower() == "flowers": dataset = hub.dataset.Flowers() elif args.dataset.lower() == "dogcat": @@ -46,6 +47,7 @@ def finetune(args): else: raise ValueError("%s dataset is not defined" % args.dataset) + # Use ImageClassificationReader to read dataset data_reader = hub.reader.ImageClassificationReader( image_width=module.get_expected_image_width(), image_height=module.get_expected_image_height(), @@ -55,25 +57,27 @@ def finetune(args): feature_map = output_dict["feature_map"] - img = input_dict["image"] - feed_list = [img.name] + # Setup feed list for data feeder + feed_list = [input_dict["image"].name] + # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=args.use_data_parallel, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) + # Define a reading comprehension finetune task by PaddleHub's API task = hub.ImageClassifierTask( data_reader=data_reader, feed_list=feed_list, feature=feature_map, num_classes=dataset.num_labels, config=config) + + # Finetune by PaddleHub's API task.finetune_and_eval() diff --git a/demo/image-classification/predict.py b/demo/image-classification/predict.py index ab7d34d9..f3e6a3fe 100644 --- a/demo/image-classification/predict.py +++ b/demo/image-classification/predict.py @@ -14,7 +14,6 @@ parser.add_argument("--checkpoint_dir", type=str, default="pad parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--module", type=str, default="resnet50", help="Module used as a feature extractor.") parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") # yapf: enable. 
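The image-classification hunks above drop the `--use_pyreader` and `enable_memory_optim` switches and lean on PaddleHub defaults. A minimal sketch of the simplified finetune flow after this patch — module, dataset, epoch count, and checkpoint path are the demo defaults or illustrative placeholders, and the reader's mean/std arguments are omitted:

```python
import paddlehub as hub

# Load a pretrained image module and get its inputs/outputs/program
module = hub.Module(name="resnet50")
input_dict, output_dict, program = module.context(trainable=True)

# Download the dataset and build a reader sized for the module
dataset = hub.dataset.Flowers()
data_reader = hub.reader.ImageClassificationReader(
    image_width=module.get_expected_image_width(),
    image_height=module.get_expected_image_height(),
    dataset=dataset)  # normalization arguments from the demo omitted for brevity

feature_map = output_dict["feature_map"]
feed_list = [input_dict["image"].name]

# Run config now relies on defaults for pyreader and memory optimization
config = hub.RunConfig(
    use_cuda=True,
    num_epoch=1,                      # illustrative
    batch_size=16,
    checkpoint_dir="./ckpt_img_cls",  # illustrative path
    strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

task = hub.ImageClassifierTask(
    data_reader=data_reader,
    feed_list=feed_list,
    feature=feature_map,
    num_classes=dataset.num_labels,
    config=config)
task.finetune_and_eval()
```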
module_map = { @@ -28,9 +27,11 @@ module_map = { def predict(args): + # Load Paddlehub pretrained model module = hub.Module(name=args.module) input_dict, output_dict, program = module.context(trainable=True) + # Download dataset if args.dataset.lower() == "flowers": dataset = hub.dataset.Flowers() elif args.dataset.lower() == "dogcat": @@ -44,6 +45,7 @@ def predict(args): else: raise ValueError("%s dataset is not defined" % args.dataset) + # Use ImageClassificationReader to read dataset data_reader = hub.reader.ImageClassificationReader( image_width=module.get_expected_image_width(), image_height=module.get_expected_image_height(), @@ -53,19 +55,19 @@ def predict(args): feature_map = output_dict["feature_map"] - img = input_dict["image"] - feed_list = [img.name] + # Setup feed list for data feeder + feed_list = [input_dict["image"].name] + # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) - task = hub.ClassifierTask( + # Define a reading comprehension finetune task by PaddleHub's API + task = hub.ImageClassifierTask( data_reader=data_reader, feed_list=feed_list, feature=feature_map, diff --git a/demo/lac/cli_demo.sh b/demo/lac/cli_demo.sh deleted file mode 100644 index 8f68175d..00000000 --- a/demo/lac/cli_demo.sh +++ /dev/null @@ -1 +0,0 @@ -python ../../paddlehub/commands/hub.py run lac --input_file test/test.txt diff --git a/demo/lac/test/test.txt b/demo/lac/test/test.txt deleted file mode 100644 index 800151bd..00000000 --- a/demo/lac/test/test.txt +++ /dev/null @@ -1,3 +0,0 @@ -今天是个好日子 -天气预报说今天要下雨 -下一班地铁马上就要到了 diff --git a/demo/lac/test/test.yml b/demo/lac/test/test.yml deleted file mode 100644 index 51314996..00000000 --- a/demo/lac/test/test.yml +++ /dev/null @@ -1,4 +0,0 @@ -input_data: - text: - type : TEXT - key : TEXT_INPUT diff --git a/demo/multi-label-classification/multi_label_classifier.py b/demo/multi-label-classification/multi_label_classifier.py index 34c535dc..f958902f 100644 --- a/demo/multi-label-classification/multi_label_classifier.py +++ b/demo/multi-label-classification/multi_label_classifier.py @@ -34,35 +34,33 @@ args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Load Paddlehub BERT pretrained model + # Load Paddlehub ERNIE 2.0 pretrained model module = hub.Module(name="ernie_v2_eng_base") - inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Setup feed list for data feeder - feed_list = [ - inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name - ] - # Download dataset and use MultiLabelReader to read dataset dataset = hub.dataset.Toxic() - reader = hub.reader.MultiLabelClassifyReader( dataset=dataset, vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) + # Setup feed list for data feeder + feed_list = [ + inputs["input_ids"].name, inputs["position_ids"].name, + inputs["segment_ids"].name, inputs["input_mask"].name + ] + # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. 
pooled_output = outputs["pooled_output"] # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay") + learning_rate=args.learning_rate) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( diff --git a/demo/multi-label-classification/predict.py b/demo/multi-label-classification/predict.py index c5052b96..0061d863 100644 --- a/demo/multi-label-classification/predict.py +++ b/demo/multi-label-classification/predict.py @@ -40,12 +40,18 @@ args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Load Paddlehub BERT pretrained model - module = hub.Module(name="ernie_eng_base.hub_module") - + # Load Paddlehub ERNIE 2.0 pretrained model + module = hub.Module(name="ernie_v2_eng_base") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + # Download dataset and use MultiLabelReader to read dataset + dataset = hub.dataset.Toxic() + reader = hub.reader.MultiLabelClassifyReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len) + # Setup feed list for data feeder feed_list = [ inputs["input_ids"].name, @@ -54,14 +60,6 @@ if __name__ == '__main__': inputs["input_mask"].name, ] - # Download dataset and use MultiLabelReader to read dataset - dataset = hub.dataset.Toxic() - - reader = hub.reader.MultiLabelClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) - # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. @@ -70,10 +68,8 @@ if __name__ == '__main__': # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=False, use_cuda=args.use_gpu, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) @@ -85,7 +81,7 @@ if __name__ == '__main__': num_classes=dataset.num_labels, config=config) - # Data to be prdicted + # Data to be predicted data = [ [ "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page." 
diff --git a/demo/multi-label-classification/run_classifier.sh b/demo/multi-label-classification/run_classifier.sh index 93b88833..b02cbb87 100644 --- a/demo/multi-label-classification/run_classifier.sh +++ b/demo/multi-label-classification/run_classifier.sh @@ -1,13 +1,7 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -# User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task -DATASET="toxic" -CKPT_DIR="./ckpt_${DATASET}" -# Recommending hyper parameters for difference task -# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 -# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5 -# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5 +CKPT_DIR="./ckpt_toxic" python -u multi_label_classifier.py \ --batch_size=32 \ @@ -16,4 +10,5 @@ python -u multi_label_classifier.py \ --learning_rate=5e-5 \ --weight_decay=0.01 \ --max_seq_len=128 \ + --warmup_proportion=0.1 \ --num_epoch=3 diff --git a/demo/qa_classification/classifier.py b/demo/qa_classification/classifier.py index 6e7ff013..4c1fad80 100644 --- a/demo/qa_classification/classifier.py +++ b/demo/qa_classification/classifier.py @@ -30,7 +30,6 @@ parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. 
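A pattern repeated across these hunks: `warmup_proportion` moves into `AdamWeightDecayStrategy` and the explicit `lr_scheduler="linear_decay"` argument is dropped, since that scheduler is the default. A condensed sketch of the strategy/config pair as the multi-label demo now writes it, using the values from the updated run_classifier.sh:

```python
import paddlehub as hub

# Warmup is configured directly on the strategy; linear_decay is the default scheduler.
strategy = hub.AdamWeightDecayStrategy(
    warmup_proportion=0.1,   # value added to run_classifier.sh by this patch
    weight_decay=0.01,
    learning_rate=5e-5)

config = hub.RunConfig(
    use_cuda=True,
    num_epoch=3,
    batch_size=32,
    checkpoint_dir="./ckpt_toxic",
    strategy=strategy)
# config then feeds the multi-label classification task defined later in the file
```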
@@ -38,13 +37,11 @@ args = parser.parse_args() if __name__ == '__main__': # Load Paddlehub ERNIE pretrained model module = hub.Module(name="ernie") - # module = hub.Module(name="bert_chinese_L-12_H-768_A-12") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) # Download dataset and use ClassifyReader to read dataset dataset = hub.dataset.NLPCC_DBQA() - reader = hub.reader.ClassifyReader( dataset=dataset, vocab_path=module.get_vocab_path(), @@ -66,14 +63,13 @@ if __name__ == '__main__': # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay") + learning_rate=args.learning_rate) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=args.use_data_parallel, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, diff --git a/demo/qa_classification/predict.py b/demo/qa_classification/predict.py index 813cc91e..3dfbcf68 100644 --- a/demo/qa_classification/predict.py +++ b/demo/qa_classification/predict.py @@ -34,7 +34,6 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") args = parser.parse_args() # yapf: enable. @@ -50,9 +49,6 @@ if __name__ == '__main__': vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) - place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
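The predict scripts also stop constructing a `CUDAPlace`/`Executor` by hand; `predict()` on the task handles execution. The hunk that follows keeps the QA demo's ranking loop, which picks the candidate answer with the highest "matched" probability. A condensed sketch of that pattern — the task variable name and the data values are illustrative, and `batch_size` is assumed to be 1:

```python
# Candidate (question, answer) pairs for a single question (illustrative data)
data = [["question text", "candidate answer 1"],
        ["question text", "candidate answer 2"]]

# cls_task is the classification task built earlier in predict.py
run_states = cls_task.predict(data=data)
results = [run_state.run_results for run_state in run_states]

max_flag, max_probs = 0, 0.0
for index, batch_result in enumerate(results):
    # batch_result[0] holds the softmax output; column 1 is the "matched" class
    prob = batch_result[0][0, 1]
    if prob > max_probs:
        max_probs, max_flag = prob, index

print("question:%s\tthe predicted matched answer:%s" %
      (data[max_flag][0], data[max_flag][1]))
```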
@@ -70,10 +66,8 @@ if __name__ == '__main__': # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) @@ -100,5 +94,5 @@ if __name__ == '__main__': max_probs = batch_result[0][0, 1] max_flag = index - print("question:%s\tthe predict answer:%s\t" % (data[max_flag][0], - data[max_flag][1])) + print("question:%s\tthe predicted matched answer:%s\t" % + (data[max_flag][0], data[max_flag][1])) diff --git a/demo/qa_classification/run_classifier.sh b/demo/qa_classification/run_classifier.sh index 6f6abc7b..a99b5ec3 100644 --- a/demo/qa_classification/run_classifier.sh +++ b/demo/qa_classification/run_classifier.sh @@ -2,10 +2,6 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 CKPT_DIR="./ckpt_qa" -# Recommending hyper parameters for difference task -# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 -# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5 -# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5 python -u classifier.py \ --batch_size=24 \ @@ -13,7 +9,7 @@ python -u classifier.py \ --checkpoint_dir=${CKPT_DIR} \ --learning_rate=5e-5 \ --weight_decay=0.01 \ + --warmup_proportion=0.1 \ --max_seq_len=128 \ --num_epoch=3 \ - --use_pyreader=False \ - --use_data_parallel=False \ + --use_data_parallel=True \ diff --git a/demo/reading-comprehension/predict.py b/demo/reading-comprehension/predict.py index 2adf9b1b..b9247152 100644 --- a/demo/reading-comprehension/predict.py +++ b/demo/reading-comprehension/predict.py @@ -41,44 +41,23 @@ hub.common.logger.logger.setLevel("INFO") parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--learning_rate", type=float, default=4e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint.") -parser.add_argument("--result_dir", type=str, default=None, help="Directory to predicted results to be written.") parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") -parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") -parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.") -parser.add_argument("--n_best_size", type=int, default=20, help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") -parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the 
threshold predict null.") -parser.add_argument("--dataset", type=str, default="squad", help="Support squad, squad2.0, drcd and cmrc2018") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Download dataset and use ReadingComprehensionReader to read dataset - if args.dataset == "squad": - dataset = hub.dataset.SQUAD(version_2_with_negative=False) - module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - elif args.dataset == "squad2.0" or args.dataset == "squad2": - args.dataset = "squad2.0" - dataset = hub.dataset.SQUAD(version_2_with_negative=True) - module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - elif args.dataset == "drcd": - dataset = hub.dataset.DRCD() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - elif args.dataset == "cmrc2018": - dataset = hub.dataset.CMRC2018() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - else: - raise Exception( - "Only support datasets: squad, squad2.0, drcd and cmrc2018") - + # Load Paddlehub BERT pretrained model + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + # Download dataset and use ReadingComprehensionReader to read dataset + # If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True + dataset = hub.dataset.SQUAD(version_2_with_negative=False) + # dataset = hub.dataset.SQUAD(version_2_with_negative=True) + reader = hub.reader.ReadingComprehensionReader( dataset=dataset, vocab_path=module.get_vocab_path(), @@ -97,25 +76,13 @@ if __name__ == '__main__': inputs["input_mask"].name, ] - # Select finetune strategy, setup config and finetune - strategy = hub.AdamWeightDecayStrategy( - weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - warmup_proportion=args.warmup_proportion, - lr_scheduler="linear_decay") - # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( - log_interval=10, - use_pyreader=args.use_pyreader, - use_data_parallel=args.use_data_parallel, - save_ckpt_interval=100, + use_data_parallel=False, use_cuda=args.use_gpu, - num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, - enable_memory_optim=True, - strategy=strategy) + strategy=hub.AdamWeightDecayStrategy()) # Define a reading comprehension finetune task by PaddleHub's API reading_comprehension_task = hub.ReadingComprehensionTask( @@ -125,5 +92,5 @@ if __name__ == '__main__': config=config) # Data to be predicted - data = dataset.dev_examples[97:98] + data = dataset.dev_examples[:10] reading_comprehension_task.predict(data=data) diff --git a/demo/reading-comprehension/reading_comprehension.py b/demo/reading-comprehension/reading_comprehension.py index 85bdada1..11fe241d 100644 --- a/demo/reading-comprehension/reading_comprehension.py +++ b/demo/reading-comprehension/reading_comprehension.py @@ -31,38 +31,22 @@ parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight dec parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.") -parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") -parser.add_argument("--n_best_size", 
type=int, default=20,help="The total number of n-best predictions to generate in the ""nbest_predictions.json output file.") -parser.add_argument("--max_answer_length", type=int, default=30,help="The maximum length of an answer that can be generated. This is needed ""because the start and end predictions are not conditioned on one another.") parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") -parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") -parser.add_argument("--dataset", type=str, default="squad", help="Support squad, squad2.0, drcd and cmrc2018") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Download dataset and use ReadingComprehensionReader to read dataset - if args.dataset == "squad": - dataset = hub.dataset.SQUAD(version_2_with_negative=False) - module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - elif args.dataset == "squad2.0" or args.dataset == "squad2": - args.dataset = "squad2.0" - dataset = hub.dataset.SQUAD(version_2_with_negative=True) - module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - elif args.dataset == "drcd": - dataset = hub.dataset.DRCD() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - elif args.dataset == "cmrc2018": - dataset = hub.dataset.CMRC2018() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - else: - raise Exception( - "Only support datasets: squad, squad2.0, drcd and cmrc2018") - + # Load Paddlehub BERT pretrained model + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + # Download dataset and use ReadingComprehensionReader to read dataset + # If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True + dataset = hub.dataset.SQUAD(version_2_with_negative=False) + # dataset = hub.dataset.SQUAD(version_2_with_negative=True) + reader = hub.reader.ReadingComprehensionReader( dataset=dataset, vocab_path=module.get_vocab_path(), @@ -84,19 +68,16 @@ if __name__ == '__main__': strategy = hub.AdamWeightDecayStrategy( weight_decay=args.weight_decay, learning_rate=args.learning_rate, - warmup_proportion=args.warmup_proportion, - lr_scheduler="linear_decay") + warmup_proportion=args.warmup_proportion) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( eval_interval=300, - use_pyreader=args.use_pyreader, use_data_parallel=args.use_data_parallel, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, - enable_memory_optim=True, strategy=strategy) # Define a reading comprehension finetune task by PaddleHub's API @@ -105,7 +86,7 @@ if __name__ == '__main__': feature=seq_output, feed_list=feed_list, config=config, - sub_task=args.dataset, + sub_task="squad", ) # Finetune by PaddleHub's API diff --git a/demo/reading-comprehension/run_finetune.sh b/demo/reading-comprehension/run_finetune.sh index 9d92042c..ba862648 100644 --- a/demo/reading-comprehension/run_finetune.sh +++ b/demo/reading-comprehension/run_finetune.sh @@ -1,20 +1,19 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -# Recommending hyper parameters 
for difference task -# squad: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5 -# squad2.0: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5 +# The suggested hyper parameters for difference task +# squad: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=3e-5 +# squad2.0: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=3e-5 # cmrc2018: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5 # drcd: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5 -dataset=cmrc2018 python -u reading_comprehension.py \ --batch_size=8 \ --use_gpu=True \ - --checkpoint_dir=./ckpt_${dataset} \ - --learning_rate=2.5e-5 \ + --checkpoint_dir="./ckpt_squad" \ + --learning_rate=3e-5 \ --weight_decay=0.01 \ --warmup_proportion=0.1 \ --num_epoch=2 \ --max_seq_len=512 \ - --dataset=${dataset} + --use_data_parallel=True diff --git a/demo/reading-comprehension/run_predict.sh b/demo/reading-comprehension/run_predict.sh index 456f7dc2..b6512192 100644 --- a/demo/reading-comprehension/run_predict.sh +++ b/demo/reading-comprehension/run_predict.sh @@ -1,18 +1,8 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_cmrc2018" -dataset=cmrc2018 - python -u predict.py \ - --batch_size=8 \ + --batch_size=1 \ --use_gpu=True \ - --dataset=${dataset} \ - --checkpoint_dir=${CKPT_DIR} \ - --learning_rate=2.5e-5 \ - --weight_decay=0.01 \ - --warmup_proportion=0.1 \ - --num_epoch=1 \ + --checkpoint_dir="./ckpt_squad" \ --max_seq_len=512 \ - --use_pyreader=False \ - --use_data_parallel=False diff --git a/demo/regression/predict.py b/demo/regression/predict.py index 4f0fb959..e98111bd 100644 --- a/demo/regression/predict.py +++ b/demo/regression/predict.py @@ -34,29 +34,17 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") -parser.add_argument("--dataset", type=str, default="STS-B", help="Directory to model checkpoint") args = parser.parse_args() # yapf: enable. 
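The reading-comprehension hunks above remove the `--dataset` switch and pin the demo to BERT plus SQuAD, with `sub_task="squad"` on the task. A sketch of the simplified finetune script, using the hyperparameters from the updated run_finetune.sh:

```python
import paddlehub as hub

# Load the BERT module used by the simplified demo
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
inputs, outputs, program = module.context(trainable=True, max_seq_len=512)

# SQuAD 1.1 by default; set version_2_with_negative=True for SQuAD 2.0
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
reader = hub.reader.ReadingComprehensionReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=512)

seq_output = outputs["sequence_output"]
feed_list = [
    inputs["input_ids"].name, inputs["position_ids"].name,
    inputs["segment_ids"].name, inputs["input_mask"].name
]

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01, learning_rate=3e-5, warmup_proportion=0.1)
config = hub.RunConfig(
    eval_interval=300,
    use_cuda=True,
    num_epoch=2,
    batch_size=8,
    checkpoint_dir="./ckpt_squad",
    strategy=strategy)

reading_comprehension_task = hub.ReadingComprehensionTask(
    data_reader=reader,
    feature=seq_output,
    feed_list=feed_list,
    config=config,
    sub_task="squad")
reading_comprehension_task.finetune_and_eval()
```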
if __name__ == '__main__': - dataset = None - metrics_choices = [] - # Download dataset and use ClassifyReader to read dataset - if args.dataset.lower() == "sts-b": - dataset = hub.dataset.GLUE("STS-B") - module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - metrics_choices = ["acc"] - else: - raise ValueError("%s dataset is not defined" % args.dataset) - - support_metrics = ["acc", "f1", "matthews"] - for metric in metrics_choices: - if metric not in support_metrics: - raise ValueError("\"%s\" metric is not defined" % metric) - + # Load Paddlehub ERNIE 2.0 pretrained model + module = hub.Module(name="ernie_v2_eng_base") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use RegressionReader to read dataset + dataset = hub.dataset.GLUE("STS-B") reader = hub.reader.RegressionReader( dataset=dataset, vocab_path=module.get_vocab_path(), @@ -79,35 +67,27 @@ if __name__ == '__main__': # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, - strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) + strategy=hub.AdamWeightDecayStrategy()) # Define a regression finetune task by PaddleHub's API reg_task = hub.RegressionTask( data_reader=reader, feature=pooled_output, feed_list=feed_list, - config=config) + config=config, + ) # Data to be prdicted - data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()] + data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]] index = 0 run_states = reg_task.predict(data=data) results = [run_state.run_results for run_state in run_states] - if not os.path.exists("output"): - os.makedirs("output") - fout = open(os.path.join("output", "%s.tsv" % args.dataset.upper()), 'w') - fout.write("index\tprediction") for batch_result in results: for result in batch_result[0]: - if index < 3: - print("%s\t%s\tpredict=%.3f" % (data[index][0], data[index][1], - result[0])) - fout.write("\n%s\t%.3f" % (index, result[0])) + print("text:%s\t%s\tpredict:%.3f" % (data[index][0], data[index][1], + result[0])) index += 1 - fout.close() diff --git a/demo/regression/regression.py b/demo/regression/regression.py index d49dd1a9..e2c1c0bf 100644 --- a/demo/regression/regression.py +++ b/demo/regression/regression.py @@ -24,30 +24,25 @@ import paddlehub as hub parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--dataset", type=str, default="STS-B", help="Directory to model checkpoint") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") -parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") +parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") 
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - dataset = None - # Download dataset and use ClassifyReader to read dataset - if args.dataset.lower() == "sts-b": - dataset = hub.dataset.GLUE("STS-B") - module = hub.Module(name="ernie_v2_eng_base") - else: - raise ValueError("%s dataset is not defined" % args.dataset) + # Load Paddlehub ERNIE 2.0 pretrained model + module = hub.Module(name="ernie_v2_eng_base") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use RegressionReader to read dataset + dataset = hub.dataset.GLUE("STS-B") reader = hub.reader.RegressionReader( dataset=dataset, vocab_path=module.get_vocab_path(), @@ -69,14 +64,14 @@ if __name__ == '__main__': # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay") + learning_rate=args.learning_rate) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( + eval_interval=300, use_data_parallel=args.use_data_parallel, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, diff --git a/demo/regression/run_predict.sh b/demo/regression/run_predict.sh index 34ce6fe3..55268caf 100644 --- a/demo/regression/run_predict.sh +++ b/demo/regression/run_predict.sh @@ -1,13 +1,9 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task -DATASET="STS-B" -CKPT_DIR="./ckpt_${DATASET}" -# STS-B: batch_size=32, max_seq_len=128 +CKPT_DIR="./ckpt_stsb" -python -u predict.py --checkpoint_dir $CKPT_DIR \ +python -u predict.py --checkpoint_dir ${CKPT_DIR} \ --max_seq_len 128 \ --use_gpu True \ - --dataset=${DATASET} \ - --batch_size=32 \ + --batch_size=1 \ diff --git a/demo/regression/run_regssion.sh b/demo/regression/run_regssion.sh index 29de8309..9866cea4 100644 --- a/demo/regression/run_regssion.sh +++ b/demo/regression/run_regssion.sh @@ -1,19 +1,15 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -DATASET="STS-B" -CKPT_DIR="./ckpt_${DATASET}" -# Recommending hyper parameters for difference task -# STS-B: batch_size=32, weight_decay=0.1, num_epoch=3, max_seq_len=128, lr=4e-5 +CKPT_DIR="./ckpt_stsb" python -u regression.py \ --batch_size=32 \ --use_gpu=True \ - --dataset=${DATASET} \ --checkpoint_dir=${CKPT_DIR} \ --learning_rate=4e-5 \ + --warmup_proportion=0.1 \ --weight_decay=0.1 \ --max_seq_len=128 \ --num_epoch=3 \ - --use_pyreader=True \ - --use_data_parallel=True + --use_data_parallel=False diff --git a/demo/senta/cli_demo.sh b/demo/senta/cli_demo.sh deleted file mode 100644 index eba1dfc2..00000000 --- a/demo/senta/cli_demo.sh +++ /dev/null @@ -1 +0,0 @@ -python ../../paddlehub/commands/hub.py run senta_bilstm --input_file test/test.txt diff --git a/demo/senta/predict.py b/demo/senta/predict.py index dfb5017f..82cf8a5c 100644 --- 
a/demo/senta/predict.py +++ b/demo/senta/predict.py @@ -17,6 +17,7 @@ import paddlehub as hub parser = argparse.ArgumentParser(__doc__) parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch when the program predicts.") args = parser.parse_args() # yapf: enable. @@ -25,31 +26,26 @@ if __name__ == '__main__': module = hub.Module(name="senta_bilstm") inputs, outputs, program = module.context(trainable=True) - # Sentence classification dataset reader + # Download dataset and use LACClassifyReader to read dataset dataset = hub.dataset.ChnSentiCorp() reader = hub.reader.LACClassifyReader( dataset=dataset, vocab_path=module.get_vocab_path()) - strategy = hub.AdamWeightDecayStrategy( - weight_decay=0.01, - warmup_proportion=0.1, - learning_rate=5e-5, - lr_scheduler="linear_decay", - optimizer_name="adam") + sent_feature = outputs["sentence_feature"] + + # Setup feed list for data feeder + # Must feed all the tensor of senta's module need + feed_list = [inputs["words"].name] + # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=False, use_cuda=args.use_gpu, - batch_size=1, - enable_memory_optim=False, + batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, - strategy=strategy) - - sent_feature = outputs["sentence_feature"] - - feed_list = [inputs["words"].name] + strategy=hub.AdamWeightDecayStrategy()) + # Define a classfication finetune task by PaddleHub's API cls_task = hub.TextClassifierTask( data_reader=reader, feature=sent_feature, @@ -57,9 +53,12 @@ if __name__ == '__main__': num_classes=dataset.num_labels, config=config) + # Data to be predicted data = ["这家餐厅很好吃", "这部电影真的很差劲"] + # Predict by PaddleHub's API run_states = cls_task.predict(data=data) + results = [run_state.run_results for run_state in run_states] index = 0 for batch_result in results: diff --git a/demo/senta/run_finetune.sh b/demo/senta/run_finetune.sh index e64efe99..2ab38301 100644 --- a/demo/senta/run_finetune.sh +++ b/demo/senta/run_finetune.sh @@ -1,11 +1,10 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -DATASET="chnsenticorp" -CKPT_DIR="./ckpt_${DATASET}" +CKPT_DIR="./ckpt_chnsenticorp" python -u senta_finetune.py \ --batch_size=24 \ - --use_gpu=False \ + --use_gpu=True \ --checkpoint_dir=${CKPT_DIR} \ --num_epoch=3 diff --git a/demo/senta/run_predict.sh b/demo/senta/run_predict.sh index f2704a2f..2e2f1ce4 100644 --- a/demo/senta/run_predict.sh +++ b/demo/senta/run_predict.sh @@ -1,5 +1,5 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_chnsenticorp/best_model" -python -u predict.py --checkpoint_dir $CKPT_DIR --use_gpu False +CKPT_DIR="./ckpt_chnsenticorp " +python -u predict.py --checkpoint_dir $CKPT_DIR --use_gpu True diff --git a/demo/senta/senta_demo.py b/demo/senta/senta_demo.py index 328517d3..a17c9ce3 100644 --- a/demo/senta/senta_demo.py +++ b/demo/senta/senta_demo.py @@ -11,10 +11,11 @@ if __name__ == "__main__": # Load Senta-BiLSTM module senta = hub.Module(name="senta_bilstm") + # Data to be predicted test_text = ["这家餐厅很好吃", "这部电影真的很差劲"] + # execute predict and print the result input_dict = {"text": test_text} - results = senta.sentiment_classify(data=input_dict) for index, text in 
enumerate(test_text): diff --git a/demo/senta/senta_finetune.py b/demo/senta/senta_finetune.py index 49eaa171..18b0a092 100644 --- a/demo/senta/senta_finetune.py +++ b/demo/senta/senta_finetune.py @@ -15,13 +15,12 @@ args = parser.parse_args() # yapf: enable. if __name__ == '__main__': - # Step1: load Paddlehub senta pretrained model + # Load Paddlehub senta pretrained model module = hub.Module(name="senta_bilstm") inputs, outputs, program = module.context(trainable=True) - # Step2: Download dataset and use LACClassifyReader to read dataset + # Download dataset and use LACClassifyReader to read dataset dataset = hub.dataset.ChnSentiCorp() - reader = hub.reader.LACClassifyReader( dataset=dataset, vocab_path=module.get_vocab_path()) @@ -31,16 +30,15 @@ if __name__ == '__main__': # Must feed all the tensor of senta's module need feed_list = [inputs["words"].name] - strategy = hub.finetune.strategy.AdamWeightDecayStrategy( - learning_rate=1e-4, weight_decay=0.01, warmup_proportion=0.05) - + # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_cuda=args.use_gpu, + use_pyreader=False, + use_data_parallel=False, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, - use_pyreader=False, - strategy=strategy) + strategy=hub.AdamWeightDecayStrategy()) # Define a classfication finetune task by PaddleHub's API cls_task = hub.TextClassifierTask( diff --git a/demo/sequence-labeling/predict.py b/demo/sequence-labeling/predict.py index 81419196..3c45b1a3 100644 --- a/demo/sequence-labeling/predict.py +++ b/demo/sequence-labeling/predict.py @@ -35,7 +35,6 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") args = parser.parse_args() # yapf: enable. @@ -52,10 +51,8 @@ if __name__ == '__main__': max_seq_len=args.max_seq_len, sp_model_path=module.get_spm_path(), word_dict_path=module.get_word_dict_path()) - inv_label_map = {val: key for key, val in reader.label_map.items()} - place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) + inv_label_map = {val: key for key, val in reader.label_map.items()} # Construct transfer learning network # Use "sequence_output" for token-level output. 
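The Senta hunks above reduce both the finetune and predict scripts to the same skeleton: the BiLSTM module's `sentence_feature` output feeds a `TextClassifierTask`, and strategy arguments fall back to `AdamWeightDecayStrategy()` defaults. A condensed sketch of the finetune path, with epoch/batch/checkpoint values taken from the updated run_finetune.sh:

```python
import paddlehub as hub

# Load the pretrained Senta-BiLSTM module
module = hub.Module(name="senta_bilstm")
inputs, outputs, program = module.context(trainable=True)

# ChnSentiCorp read with the LAC-based classification reader
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.LACClassifyReader(
    dataset=dataset, vocab_path=module.get_vocab_path())

sent_feature = outputs["sentence_feature"]
feed_list = [inputs["words"].name]

config = hub.RunConfig(
    use_cuda=True,
    use_pyreader=False,        # LACClassifyReader requires the feed-dict path
    use_data_parallel=False,
    num_epoch=3,
    batch_size=24,
    checkpoint_dir="./ckpt_chnsenticorp",
    strategy=hub.AdamWeightDecayStrategy())

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=sent_feature,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)
cls_task.finetune_and_eval()
```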
@@ -73,10 +70,8 @@ if __name__ == '__main__': # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, - enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) @@ -91,7 +86,7 @@ if __name__ == '__main__': config=config, add_crf=True) - # test data + # Data to be predicted data = [ ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], ["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], diff --git a/demo/sequence-labeling/run_sequence_label.sh b/demo/sequence-labeling/run_sequence_label.sh index ab5eef90..bb78fde6 100644 --- a/demo/sequence-labeling/run_sequence_label.sh +++ b/demo/sequence-labeling/run_sequence_label.sh @@ -9,5 +9,5 @@ python -u sequence_label.py \ --checkpoint_dir $CKPT_DIR \ --max_seq_len 128 \ --learning_rate 5e-5 \ - --use_pyreader True \ + --warmup_proportion 0.1 \ --use_data_parallel True diff --git a/demo/sequence-labeling/sequence_label.py b/demo/sequence-labeling/sequence_label.py index 52cfb665..a2b283e8 100644 --- a/demo/sequence-labeling/sequence_label.py +++ b/demo/sequence-labeling/sequence_label.py @@ -26,17 +26,16 @@ parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") +parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. 
if __name__ == '__main__': - # Load Paddlehub ERNIE pretrained model + # Load Paddlehub ERNIE Tiny pretrained model module = hub.Module(name="ernie_tiny") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) @@ -55,8 +54,7 @@ if __name__ == '__main__': sequence_output = outputs["sequence_output"] # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - # Compared to classification task, we need add seq_len tensor to feedlist + # Must feed all the tensor of module need feed_list = [ inputs["input_ids"].name, inputs["position_ids"].name, inputs["segment_ids"].name, inputs["input_mask"].name @@ -64,15 +62,13 @@ if __name__ == '__main__': # Select a finetune strategy strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, weight_decay=args.weight_decay, - learning_rate=args.learning_rate, - lr_scheduler="linear_decay", - ) + learning_rate=args.learning_rate) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=args.use_data_parallel, - use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, @@ -80,7 +76,7 @@ if __name__ == '__main__': strategy=strategy) # Define a sequence labeling finetune task by PaddleHub's API - # if add crf, the network use crf as decoder + # If add crf, the network use crf as decoder seq_label_task = hub.SequenceLabelTask( data_reader=reader, feature=sequence_output, diff --git a/demo/ssd/cli_demo.sh b/demo/ssd/cli_demo.sh deleted file mode 100644 index c1755a76..00000000 --- a/demo/ssd/cli_demo.sh +++ /dev/null @@ -1 +0,0 @@ -python ../../paddlehub/commands/hub.py run ssd_mobilenet_v1_pascal --input_file test/test.txt diff --git a/demo/text-classification/predict.py b/demo/text-classification/predict.py index b044a815..8aaac3ab 100644 --- a/demo/text-classification/predict.py +++ b/demo/text-classification/predict.py @@ -33,99 +33,27 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") -parser.add_argument("--dataset", type=str, default="chnsenticorp", help="The choice of dataset") parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") args = parser.parse_args() # yapf: enable. 
if __name__ == '__main__': - dataset = None - metrics_choices = [] - # Download dataset and use ClassifyReader to read dataset - if args.dataset.lower() == "chnsenticorp": - dataset = hub.dataset.ChnSentiCorp() - module = hub.Module(name="ernie_tiny") - metrics_choices = ["acc"] - elif args.dataset.lower() == "tnews": - dataset = hub.dataset.TNews() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "nlpcc_dbqa": - dataset = hub.dataset.NLPCC_DBQA() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "lcqmc": - dataset = hub.dataset.LCQMC() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'inews': - dataset = hub.dataset.INews() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'bq': - dataset = hub.dataset.BQ() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'thucnews': - dataset = hub.dataset.THUCNEWS() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'iflytek': - dataset = hub.dataset.IFLYTEK() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mrpc": - dataset = hub.dataset.GLUE("MRPC") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["f1", "acc"] - # The first metric will be choose to eval. Ref: task.py:799 - elif args.dataset.lower() == "qqp": - dataset = hub.dataset.GLUE("QQP") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["f1", "acc"] - elif args.dataset.lower() == "sst-2": - dataset = hub.dataset.GLUE("SST-2") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "cola": - dataset = hub.dataset.GLUE("CoLA") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["matthews", "acc"] - elif args.dataset.lower() == "qnli": - dataset = hub.dataset.GLUE("QNLI") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "rte": - dataset = hub.dataset.GLUE("RTE") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli_m": - dataset = hub.dataset.GLUE("MNLI_m") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mnli_mm": - dataset = hub.dataset.GLUE("MNLI_mm") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower().startswith("xnli"): - dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:]) - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - else: - raise ValueError("%s dataset is not defined" % args.dataset) - - support_metrics = ["acc", "f1", "matthews"] - for metric in metrics_choices: - if metric not in support_metrics: - raise ValueError("\"%s\" metric is not defined" % metric) - + # Load Paddlehub ERNIE Tiny pretrained model + module = hub.Module(name="ernie_tiny") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use accuracy as metrics + # Choose dataset: 
GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
+    dataset = hub.dataset.ChnSentiCorp()
+
+    # For ernie_tiny, it uses sub-word to tokenize Chinese sentences
+    # If not ernie_tiny, sp_model_path and word_dict_path should be set to None
     reader = hub.reader.ClassifyReader(
-        dataset=dataset, vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        dataset=dataset,
+        vocab_path=module.get_vocab_path(),
+        max_seq_len=args.max_seq_len,
+        sp_model_path=module.get_spm_path(),
+        word_dict_path=module.get_word_dict_path())

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
@@ -133,7 +61,7 @@
     pooled_output = outputs["pooled_output"]

     # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
+    # Must feed all the tensors the module needs
     feed_list = [
         inputs["input_ids"].name,
         inputs["position_ids"].name,
@@ -143,13 +71,11 @@

     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
-        use_data_parallel=False,
-        use_pyreader=args.use_pyreader,
+        use_data_parallel=args.use_data_parallel,
         use_cuda=args.use_gpu,
         batch_size=args.batch_size,
-        enable_memory_optim=False,
         checkpoint_dir=args.checkpoint_dir,
-        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
+        strategy=hub.AdamWeightDecayStrategy())

     # Define a classfication finetune task by PaddleHub's API
     cls_task = hub.TextClassifierTask(
@@ -157,11 +83,11 @@
         feature=pooled_output,
         feed_list=feed_list,
         num_classes=dataset.num_labels,
-        config=config,
-        metrics_choices=metrics_choices)
+        config=config)

     # Data to be prdicted
-    data = [[d.text_a, d.text_b] for d in dataset.get_dev_examples()[:3]]
+    data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
+            ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]

     index = 0
     run_states = cls_task.predict(data=data)
diff --git a/demo/text-classification/run_classifier.sh b/demo/text-classification/run_classifier.sh
index c7e5d329..c7bfc95e 100644
--- a/demo/text-classification/run_classifier.sh
+++ b/demo/text-classification/run_classifier.sh
@@ -1,23 +1,20 @@
 export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0

-# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
-DATASET="chnsenticorp"
-CKPT_DIR="./ckpt_${DATASET}"
+CKPT_DIR="./ckpt_chnsenticorp"

 python -u text_classifier.py \
     --batch_size=24 \
     --use_gpu=True \
-    --dataset=${DATASET} \
     --checkpoint_dir=${CKPT_DIR} \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
+    --warmup_proportion=0.1 \
     --num_epoch=3 \
-    --use_pyreader=True \
     --use_data_parallel=True

-# Recommending hyper parameters for difference task
+# The suggested hyper parameters for different tasks
 # for ChineseGLUE:
 # TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
 # LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
diff --git a/demo/text-classification/run_predict.sh b/demo/text-classification/run_predict.sh
index f8badbb0..5daba182 100644
--- a/demo/text-classification/run_predict.sh
+++ b/demo/text-classification/run_predict.sh
@@ -1,20 +1,9 @@
 export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0

-# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
-# Support ChnSentiCorp NLPCC_DBQA LCQMC MRPC QQP SST-2
-# CoLA QNLI RTE MNLI (or MNLI_m) MNLI_mm) XNLI
-# for XNLI: Specify the language with an underscore like xnli_zh.
-# ar: Arabic bg: Bulgarian de: German
-# el: Greek en: English es: Spanish
-# fr: French hi: Hindi ru: Russian
-# sw: Swahili th: Thai tr: Turkish
-# ur: Urdu vi: Vietnamese zh: Chinese (Simplified)
-DATASET="ChnSentiCorp"
-CKPT_DIR="./ckpt_${DATASET}"
+CKPT_DIR="./ckpt_chnsenticorp"

 python -u predict.py --checkpoint_dir=$CKPT_DIR \
     --max_seq_len=128 \
     --use_gpu=True \
-    --dataset=${DATASET} \
-    --batch_size=32 \
+    --batch_size=24
diff --git a/demo/text-classification/text_classifier.py b/demo/text-classification/text_classifier.py
index 155d9e6d..6d960696 100644
--- a/demo/text-classification/text_classifier.py
+++ b/demo/text-classification/text_classifier.py
@@ -23,106 +23,38 @@ import paddlehub as hub
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
-parser.add_argument("--dataset", type=str, default="chnsenticorp", help="The choice of dataset")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
-parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
-parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
+parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
-parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
 parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
 args = parser.parse_args()
 # yapf: enable.
if __name__ == '__main__': - dataset = None - metrics_choices = [] - # Download dataset and use ClassifyReader to read dataset - if args.dataset.lower() == "chnsenticorp": - dataset = hub.dataset.ChnSentiCorp() - module = hub.Module(name="ernie_tiny") - metrics_choices = ["acc"] - elif args.dataset.lower() == "tnews": - dataset = hub.dataset.TNews() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "nlpcc_dbqa": - dataset = hub.dataset.NLPCC_DBQA() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "lcqmc": - dataset = hub.dataset.LCQMC() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'inews': - dataset = hub.dataset.INews() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'bq': - dataset = hub.dataset.BQ() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'thucnews': - dataset = hub.dataset.THUCNEWS() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == 'iflytek': - dataset = hub.dataset.IFLYTEK() - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mrpc": - dataset = hub.dataset.GLUE("MRPC") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["f1", "acc"] - # The first metric will be choose to eval. Ref: task.py:799 - elif args.dataset.lower() == "qqp": - dataset = hub.dataset.GLUE("QQP") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["f1", "acc"] - elif args.dataset.lower() == "sst-2": - dataset = hub.dataset.GLUE("SST-2") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "cola": - dataset = hub.dataset.GLUE("CoLA") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["matthews", "acc"] - elif args.dataset.lower() == "qnli": - dataset = hub.dataset.GLUE("QNLI") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "rte": - dataset = hub.dataset.GLUE("RTE") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli_m": - dataset = hub.dataset.GLUE("MNLI_m") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower() == "mnli_mm": - dataset = hub.dataset.GLUE("MNLI_mm") - module = hub.Module(name="ernie_v2_eng_base") - metrics_choices = ["acc"] - elif args.dataset.lower().startswith("xnli"): - dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:]) - module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16") - metrics_choices = ["acc"] - else: - raise ValueError("%s dataset is not defined" % args.dataset) - # Check metric - support_metrics = ["acc", "f1", "matthews"] - for metric in metrics_choices: - if metric not in support_metrics: - raise ValueError("\"%s\" metric is not defined" % metric) - - # Start preparing parameters for reader and task accoring to module - # For ernie_v2, it has an addition embedding named task_id - # For ernie_v2_chinese_tiny, it use an addition sentence_piece_vocab to tokenize + # Load Paddlehub ERNIE 
Tiny pretrained model
+    module = hub.Module(name="ernie_tiny")
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
+
+    # Download dataset and use accuracy as metrics
+    # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
+    # metric should be acc, f1 or matthews
+    dataset = hub.dataset.ChnSentiCorp()
+    metrics_choices = ["acc"]
+
+    # For ernie_tiny, it uses sub-word to tokenize Chinese sentences
+    # If not ernie_tiny, sp_model_path and word_dict_path should be set to None
+    reader = hub.reader.ClassifyReader(
+        dataset=dataset,
+        vocab_path=module.get_vocab_path(),
+        max_seq_len=args.max_seq_len,
+        sp_model_path=module.get_spm_path(),
+        word_dict_path=module.get_word_dict_path())
+
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
@@ -136,26 +68,16 @@ if __name__ == '__main__':
         inputs["segment_ids"].name,
         inputs["input_mask"].name,
     ]
-    # Finish preparing parameter for reader and task accoring to modul
-
-    # Define reader
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())

     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
+        warmup_proportion=args.warmup_proportion,
         weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        lr_scheduler="linear_decay")
+        learning_rate=args.learning_rate)

     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=args.use_data_parallel,
-        use_pyreader=args.use_pyreader,
         use_cuda=args.use_gpu,
         num_epoch=args.num_epoch,
         batch_size=args.batch_size,
--
GitLab
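
For reviewers skimming the patch: after these changes the text-classification demo reduces to the single hard-coded flow below. This is an illustrative sketch assembled only from the hunks above, not part of the patch itself; the final finetune_and_eval() call is the usual PaddleHub 1.x entry point and is assumed here because the closing hunk of text_classifier.py is cut off above.

import paddlehub as hub

# Load the ERNIE Tiny module and get its inputs/outputs/program
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# Download ChnSentiCorp and read it with ClassifyReader
# (sp_model_path / word_dict_path are only needed for ernie_tiny)
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128,
    sp_model_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path())

# Use the sentence-level feature and feed every tensor the module needs
pooled_output = outputs["pooled_output"]
feed_list = [
    inputs["input_ids"].name, inputs["position_ids"].name,
    inputs["segment_ids"].name, inputs["input_mask"].name
]

# Strategy and run config mirror the defaults used in run_classifier.sh
strategy = hub.AdamWeightDecayStrategy(
    warmup_proportion=0.1, weight_decay=0.01, learning_rate=5e-5)
config = hub.RunConfig(
    use_cuda=True,
    num_epoch=3,
    batch_size=24,
    checkpoint_dir="./ckpt_chnsenticorp",
    strategy=strategy)

# Classification finetune task over the pooled sentence feature
cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)
cls_task.finetune_and_eval()  # assumed entry point; the hunk containing this call is truncated above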