提交 b9ad9775 编写于 作者: B BinLong

Merge branch 'develop' of github.com:PaddlePaddle/PaddleHub into develop

...@@ -36,6 +36,8 @@ $ pip install --upgrade paddlepaddle ...@@ -36,6 +36,8 @@ $ pip install --upgrade paddlepaddle
--checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt --checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt
--dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat/stanforddogs/indoor67/food101}。默认为flowers --dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat/stanforddogs/indoor67/food101}。默认为flowers
--use_gpu: 是否使用GPU进行训练,如果机器支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭 --use_gpu: 是否使用GPU进行训练,如果机器支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭
--use_data_parallel: 是否使用数据并行,打开该开关时,会将数据分散到不同的卡上进行训练(CPU下会分布到不同线程)。默认关闭
--use_pyreader: 是否使用pyreader进行数据喂入。默认关闭
``` ```
## 进行预测 ## 进行预测
...@@ -51,6 +53,7 @@ $ pip install --upgrade paddlepaddle ...@@ -51,6 +53,7 @@ $ pip install --upgrade paddlepaddle
--checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt --checkpoint_dir: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt
--dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat}。默认为flowers --dataset: 使用什么数据集进行finetune, 脚本支持分别是{flowers/dogcat}。默认为flowers
--use_gpu: 是否使用GPU进行训练,如果本机支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭 --use_gpu: 是否使用GPU进行训练,如果本机支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭
--use_pyreader: 是否使用pyreader进行数据喂入。默认关闭
``` ```
`注意`:进行预测时,所选择的module,checkpoint_dir,dataset必须和finetune所用的一样 `注意`:进行预测时,所选择的module,checkpoint_dir,dataset必须和finetune所用的一样
#coding:utf-8 #coding:utf-8
import argparse import argparse
import os import os
import ast
import paddle.fluid as fluid import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
...@@ -8,12 +9,14 @@ import numpy as np ...@@ -8,12 +9,14 @@ import numpy as np
# yapf: disable # yapf: disable
parser = argparse.ArgumentParser(__doc__) parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=bool, default=True, help="Whether use GPU for fine-tuning.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for fine-tuning.")
parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.") parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.")
parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
# yapf: enable. # yapf: enable.
module_map = { module_map = {
...@@ -56,6 +59,8 @@ def finetune(args): ...@@ -56,6 +59,8 @@ def finetune(args):
feed_list = [img.name] feed_list = [img.name]
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
batch_size=args.batch_size, batch_size=args.batch_size,
......
#coding:utf-8 #coding:utf-8
import argparse import argparse
import os import os
import ast
import paddle.fluid as fluid import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
...@@ -8,11 +9,12 @@ import numpy as np ...@@ -8,11 +9,12 @@ import numpy as np
# yapf: disable # yapf: disable
parser = argparse.ArgumentParser(__doc__) parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--use_gpu", type=bool, default=False, help="Whether use GPU for predict.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for predict.")
parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--module", type=str, default="resnet50", help="Module used as a feature extractor.") parser.add_argument("--module", type=str, default="resnet50", help="Module used as a feature extractor.")
parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
# yapf: enable. # yapf: enable.
module_map = { module_map = {
...@@ -56,6 +58,7 @@ def predict(args): ...@@ -56,6 +58,7 @@ def predict(args):
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=False, use_data_parallel=False,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
batch_size=args.batch_size, batch_size=args.batch_size,
enable_memory_optim=False, enable_memory_optim=False,
......
...@@ -19,10 +19,10 @@ if __name__ == "__main__": ...@@ -19,10 +19,10 @@ if __name__ == "__main__":
results = lac.lexical_analysis(data=inputs) results = lac.lexical_analysis(data=inputs)
for result in results: for result in results:
if six.PY2: if six.PY2:
print(json.dumps( print(
result['word'], encoding="utf8", ensure_ascii=False)) json.dumps(result['word'], encoding="utf8", ensure_ascii=False))
print(json.dumps( print(
result['tag'], encoding="utf8", ensure_ascii=False)) json.dumps(result['tag'], encoding="utf8", ensure_ascii=False))
else: else:
print(result['word']) print(result['word'])
print(result['tag']) print(result['tag'])
...@@ -21,7 +21,7 @@ if __name__ == "__main__": ...@@ -21,7 +21,7 @@ if __name__ == "__main__":
results[index]["text"] = text results[index]["text"] = text
for index, result in enumerate(results): for index, result in enumerate(results):
if six.PY2: if six.PY2:
print(json.dumps( print(
results[index], encoding="utf8", ensure_ascii=False)) json.dumps(results[index], encoding="utf8", ensure_ascii=False))
else: else:
print(results[index]) print(results[index])
...@@ -33,15 +33,16 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 ...@@ -33,15 +33,16 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
parser = argparse.ArgumentParser(__doc__) parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
if __name__ == '__main__': if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model # loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie")
input_dict, output_dict, program = module.context( inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
max_seq_len=args.max_seq_len)
# Sentence labeling dataset reader # Sentence labeling dataset reader
dataset = hub.dataset.MSRA_NER() dataset = hub.dataset.MSRA_NER()
...@@ -53,70 +54,67 @@ if __name__ == '__main__': ...@@ -53,70 +54,67 @@ if __name__ == '__main__':
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
with fluid.program_guard(program):
# Use "sequence_outputs" for token-level output. # Construct transfer learning network
sequence_output = output_dict["sequence_output"] # Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Define a classification finetune task by PaddleHub's API
seq_label_task = hub.create_seq_label_task( # Setup feed list for data feeder
feature=sequence_output, # Must feed all the tensor of ERNIE's module need
num_classes=dataset.num_labels, feed_list = [
max_seq_len=args.max_seq_len) inputs["input_ids"].name,
inputs["position_ids"].name,
# Setup feed list for data feeder inputs["segment_ids"].name,
# Must feed all the tensor of ERNIE's module need inputs["input_mask"].name,
# Compared to classification task, we need add seq_len tensor to feedlist ]
feed_list = [
input_dict["input_ids"].name, input_dict["position_ids"].name, # Setup runing config for PaddleHub Finetune API
input_dict["segment_ids"].name, input_dict["input_mask"].name, config = hub.RunConfig(
seq_label_task.variable('label').name, use_data_parallel=False,
seq_label_task.variable('seq_len').name use_pyreader=args.use_pyreader,
] use_cuda=args.use_gpu,
batch_size=args.batch_size,
fetch_list = [ enable_memory_optim=False,
seq_label_task.variable("labels").name, checkpoint_dir=args.checkpoint_dir,
seq_label_task.variable("infers").name, strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
seq_label_task.variable("seq_len").name
] # Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.SequenceLabelTask(
# classification probability tensor data_reader=reader,
probs = seq_label_task.variable("probs") feature=sequence_output,
feed_list=feed_list,
# load best model checkpoint max_seq_len=args.max_seq_len,
fluid.io.load_persistables(exe, args.checkpoint_dir) num_classes=dataset.num_labels,
config=config)
inference_program = program.clone(for_test=True)
# test data
# calculate the num of label from probs variable shape data = [
num_labels = seq_label_task.variable("probs").shape[1] ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"],
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) ["其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"],
test_reader = reader.data_generator(phase='test', shuffle=False) ["有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"],
test_examples = dataset.get_test_examples() ["不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
total_label, total_infer, total_correct = 0.0, 0.0, 0.0 ]
for index, batch in enumerate(test_reader()):
np_labels, np_infers, np_lens = exe.run( results = seq_label_task.predict(data=data)
feed=data_feeder.feed(batch),
fetch_list=fetch_list, for num_batch, batch_results in enumerate(results):
program=inference_program) infers = batch_results[0].reshape([-1]).astype(np.int32).tolist()
label_num, infer_num, correct_num = chunk_eval( np_lens = batch_results[1]
np_labels, np_infers, np_lens, num_labels)
for index, np_len in enumerate(np_lens):
total_infer += infer_num labels = infers[index * args.max_seq_len:(index + 1) *
total_label += label_num args.max_seq_len]
total_correct += correct_num
labels = np_labels.reshape([-1]).astype(np.int32).tolist()
label_str = "" label_str = ""
count = 0 count = 0
for label_val in labels: for label_val in labels:
label_str += inv_label_map[label_val] label_str += inv_label_map[label_val]
count += 1 count += 1
if count == np_lens: if count == np_len:
break break
print("%s\tpredict=%s" % (test_examples[index], label_str)) # Drop the label results of CLS and SEP Token
print(
precision, recall, f1 = calculate_f1(total_label, total_infer, "%s\tpredict=%s" %
total_correct) (data[num_batch * args.batch_size + index][0], label_str[1:-1]))
print("F1-Score=%f, precision=%f, recall=%f " % (f1, precision, recall))
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_sequence_label/best_model" CKPT_DIR="./ckpt_sequence_label"
python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
...@@ -7,4 +7,6 @@ python -u sequence_label.py \ ...@@ -7,4 +7,6 @@ python -u sequence_label.py \
--num_epoch 3 \ --num_epoch 3 \
--checkpoint_dir $CKPT_DIR \ --checkpoint_dir $CKPT_DIR \
--max_seq_len 256 \ --max_seq_len 256 \
--learning_rate 5e-5 --learning_rate 5e-5 \
--use_pyreader True \
--use_data_parallel True
...@@ -30,40 +30,34 @@ parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup ...@@ -30,40 +30,34 @@ parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
if __name__ == '__main__': if __name__ == '__main__':
# Step1: load Paddlehub ERNIE pretrained model # Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie")
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Step2: Download dataset and use SequenceLabelReader to read dataset # Download dataset and use SequenceLabelReader to read dataset
dataset = hub.dataset.MSRA_NER() dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader( reader = hub.reader.SequenceLabelReader(
dataset=dataset, dataset=dataset,
vocab_path=module.get_vocab_path(), vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len) max_seq_len=args.max_seq_len)
# Step3: construct transfer learning network # Construct transfer learning network
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"] sequence_output = outputs["sequence_output"]
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.create_seq_label_task(
feature=sequence_output,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels)
# Setup feed list for data feeder # Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need # Must feed all the tensor of ERNIE's module need
# Compared to classification task, we need add seq_len tensor to feedlist # Compared to classification task, we need add seq_len tensor to feedlist
feed_list = [ feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name, inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name, inputs["segment_ids"].name, inputs["input_mask"].name
seq_label_task.variable('label').name,
seq_label_task.variable('seq_len').name
] ]
# Select a finetune strategy # Select a finetune strategy
...@@ -75,16 +69,23 @@ if __name__ == '__main__': ...@@ -75,16 +69,23 @@ if __name__ == '__main__':
# Setup running config for PaddleHub Finetune API # Setup running config for PaddleHub Finetune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
batch_size=args.batch_size, batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir, checkpoint_dir=args.checkpoint_dir,
strategy=strategy) strategy=strategy)
# Finetune and evaluate model by PaddleHub's API # Define a sequence labeling finetune task by PaddleHub's API
# will finish training, evaluation, testing, save model automatically seq_label_task = hub.SequenceLabelTask(
hub.finetune_and_eval(
task=seq_label_task,
data_reader=reader, data_reader=reader,
feature=sequence_output,
feed_list=feed_list, feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config) config=config)
# Finetune and evaluate model by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
seq_label_task.finetune_and_eval()
...@@ -31,16 +31,17 @@ import paddlehub as hub ...@@ -31,16 +31,17 @@ import paddlehub as hub
# yapf: disable # yapf: disable
parser = argparse.ArgumentParser(__doc__) parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
if __name__ == '__main__': if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model # loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie")
input_dict, output_dict, program = module.context( inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
max_seq_len=args.max_seq_len)
# Sentence classification dataset reader # Sentence classification dataset reader
dataset = hub.dataset.ChnSentiCorp() dataset = hub.dataset.ChnSentiCorp()
...@@ -51,46 +52,55 @@ if __name__ == '__main__': ...@@ -51,46 +52,55 @@ if __name__ == '__main__':
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
with fluid.program_guard(program):
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
pooled_output = output_dict["pooled_output"]
# Define a classification finetune task by PaddleHub's API
cls_task = hub.create_text_cls_task(
feature=pooled_output, num_classes=dataset.num_labels)
# Setup feed list for data feeder # Construct transfer learning network
# Must feed all the tensor of ERNIE's module need # Use "pooled_output" for classification tasks on an entire sentence.
feed_list = [ # Use "sequence_output" for token-level output.
input_dict["input_ids"].name, input_dict["position_ids"].name, pooled_output = outputs["pooled_output"]
input_dict["segment_ids"].name, input_dict["input_mask"].name,
cls_task.variable('label').name
]
# classification probability tensor # Setup feed list for data feeder
probs = cls_task.variable("probs") # Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
pred = fluid.layers.argmax(probs, axis=1) # Setup runing config for PaddleHub Finetune API
config = hub.RunConfig(
use_data_parallel=False,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu,
batch_size=args.batch_size,
enable_memory_optim=False,
checkpoint_dir=args.checkpoint_dir,
strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
# load best model checkpoint # Define a classification finetune task by PaddleHub's API
fluid.io.load_persistables(exe, args.checkpoint_dir) cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
inference_program = program.clone(for_test=True) # Data to be prdicted
data = [
["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
[
"还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。"
],
[
"前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦"
], ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]
]
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place) index = 0
test_reader = reader.data_generator(phase='test', shuffle=False) results = cls_task.predict(data=data)
test_examples = dataset.get_test_examples() for batch_result in results:
total = 0 # get predict index
correct = 0 batch_result = np.argmax(batch_result, axis=2)[0]
for index, batch in enumerate(test_reader()): for result in batch_result:
pred_v = exe.run( print("%s\tpredict=%s" % (data[index][0], result))
feed=data_feeder.feed(batch), index += 1
fetch_list=[pred.name],
program=inference_program)
total += 1
if (pred_v[0][0] == int(test_examples[index].label)):
correct += 1
acc = 1.0 * correct / total
print("%s\tpredict=%s" % (test_examples[index], pred_v[0][0]))
print("accuracy = %f" % acc)
export CUDA_VISIBLE_DEVICES=1 export CUDA_VISIBLE_DEVICES=0
# User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task # User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task
DATASET="chnsenticorp" DATASET="chnsenticorp"
...@@ -16,4 +16,6 @@ python -u text_classifier.py \ ...@@ -16,4 +16,6 @@ python -u text_classifier.py \
--learning_rate=5e-5 \ --learning_rate=5e-5 \
--weight_decay=0.01 \ --weight_decay=0.01 \
--max_seq_len=128 \ --max_seq_len=128 \
--num_epoch=3 --num_epoch=3 \
--use_pyreader=True \
--use_data_parallel=True \
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_chnsenticorp/best_model" CKPT_DIR="./ckpt_chnsenticorp"
python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False
...@@ -32,17 +32,19 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training ...@@ -32,17 +32,19 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
if __name__ == '__main__': if __name__ == '__main__':
# Step1: load Paddlehub ERNIE pretrained model # Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie")
# module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12") # module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Step2: Download dataset and use ClassifyReader to read dataset # Download dataset and use ClassifyReader to read dataset
dataset = None dataset = None
if args.dataset.lower() == "chnsenticorp": if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp() dataset = hub.dataset.ChnSentiCorp()
...@@ -58,39 +60,44 @@ if __name__ == '__main__': ...@@ -58,39 +60,44 @@ if __name__ == '__main__':
vocab_path=module.get_vocab_path(), vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len) max_seq_len=args.max_seq_len)
# Step3: construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Define a classification finetune task by PaddleHub's API
cls_task = hub.create_text_cls_task(
feature=pooled_output, num_classes=dataset.num_labels)
# Setup feed list for data feeder # Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need # Must feed all the tensor of ERNIE's module need
feed_list = [ feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name, inputs["input_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name, inputs["position_ids"].name,
cls_task.variable('label').name inputs["segment_ids"].name,
inputs["input_mask"].name,
] ]
# Step4: Select finetune strategy, setup config and finetune # Select finetune strategy, setup config and finetune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
weight_decay=args.weight_decay, weight_decay=args.weight_decay,
learning_rate=args.learning_rate, learning_rate=args.learning_rate,
lr_scheduler="linear_decay", lr_scheduler="linear_decay")
)
# Setup running config for PaddleHub Finetune API # Setup running config for PaddleHub Finetune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
batch_size=args.batch_size, batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir, checkpoint_dir=args.checkpoint_dir,
strategy=strategy) strategy=strategy)
# Define a classification finetune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
# Finetune and evaluate by PaddleHub's API # Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically # will finish training, evaluation, testing, save model automatically
hub.finetune_and_eval( cls_task.finetune_and_eval()
task=cls_task, data_reader=reader, feed_list=feed_list, config=config)
...@@ -50,7 +50,7 @@ class ClearCommand(BaseCommand): ...@@ -50,7 +50,7 @@ class ClearCommand(BaseCommand):
def __init__(self, name): def __init__(self, name):
super(ClearCommand, self).__init__(name) super(ClearCommand, self).__init__(name)
self.show_in_help = True self.show_in_help = True
self.description = "Clear all cache data." self.description = "Clear all cached data."
def cache_dir(self): def cache_dir(self):
return CACHE_HOME return CACHE_HOME
......
...@@ -110,8 +110,17 @@ class BasicTask(object): ...@@ -110,8 +110,17 @@ class BasicTask(object):
# run config # run config
self.config = config if config else RunConfig() self.config = config if config else RunConfig()
self.place, self.device_count = hub.common.get_running_device_info( self.place = self.places[0]
self.config) self.device_count = len(self.places)
if self.config.batch_size < self.device_count:
logger.warning(
"Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions"
.format(self.config.batch_size, self.device_count))
logger.warning("Batch size automatically adjusted to {}".format(
self.device_count))
self.config._batch_size = self.device_count
self.exe = fluid.Executor(place=self.place) self.exe = fluid.Executor(place=self.place)
self.build_strategy = fluid.BuildStrategy() self.build_strategy = fluid.BuildStrategy()
if self.config.enable_memory_optim: if self.config.enable_memory_optim:
...@@ -239,6 +248,12 @@ class BasicTask(object): ...@@ -239,6 +248,12 @@ class BasicTask(object):
self.exe.run(self.env.startup_program) self.exe.run(self.env.startup_program)
self._build_env_end_event() self._build_env_end_event()
@property
def places(self):
if self.config.use_cuda:
return fluid.framework.cuda_places()
return fluid.framework.cpu_places()
@property @property
def is_train_phase(self): def is_train_phase(self):
return self.phase in ["train"] return self.phase in ["train"]
...@@ -481,6 +496,9 @@ class BasicTask(object): ...@@ -481,6 +496,9 @@ class BasicTask(object):
period_run_states = [] period_run_states = []
for run_step, batch in enumerate(self.reader(), start=1): for run_step, batch in enumerate(self.reader(), start=1):
if self.config.use_data_parallel and len(batch) < self.device_count:
continue
step_run_state = RunState(len(self.fetch_list)) step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1 step_run_state.run_step = 1
num_batch_examples = len(batch) num_batch_examples = len(batch)
...@@ -554,10 +572,10 @@ class BasicTask(object): ...@@ -554,10 +572,10 @@ class BasicTask(object):
class ClassifierTask(BasicTask): class ClassifierTask(BasicTask):
def __init__(self, def __init__(self,
data_reader,
feature, feature,
num_classes, num_classes,
feed_list, feed_list,
data_reader,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None): hidden_units=None):
...@@ -662,10 +680,10 @@ ImageClassifierTask = ClassifierTask ...@@ -662,10 +680,10 @@ ImageClassifierTask = ClassifierTask
class TextClassifierTask(ClassifierTask): class TextClassifierTask(ClassifierTask):
def __init__(self, def __init__(self,
data_reader,
feature, feature,
num_classes, num_classes,
feed_list, feed_list,
data_reader,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None): hidden_units=None):
...@@ -711,8 +729,8 @@ class SequenceLabelTask(BasicTask): ...@@ -711,8 +729,8 @@ class SequenceLabelTask(BasicTask):
feature, feature,
max_seq_len, max_seq_len,
num_classes, num_classes,
data_reader,
feed_list, feed_list,
data_reader,
startup_program=None, startup_program=None,
config=None, config=None,
): ):
...@@ -743,6 +761,14 @@ class SequenceLabelTask(BasicTask): ...@@ -743,6 +761,14 @@ class SequenceLabelTask(BasicTask):
name="cls_seq_label_out_b", name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.))) initializer=fluid.initializer.Constant(0.)))
self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
ret_infers = fluid.layers.assign(self.ret_infers)
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
logits = self.logits logits = self.logits
logits = fluid.layers.flatten(logits, axis=2) logits = fluid.layers.flatten(logits, axis=2)
logits = fluid.layers.softmax(logits) logits = fluid.layers.softmax(logits)
...@@ -761,13 +787,8 @@ class SequenceLabelTask(BasicTask): ...@@ -761,13 +787,8 @@ class SequenceLabelTask(BasicTask):
return loss return loss
def _add_metrics(self): def _add_metrics(self):
ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1]) self.ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1])
ret_infers = fluid.layers.reshape( return [self.ret_labels, self.ret_infers, self.seq_len]
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
return [ret_labels, ret_infers, seq_len]
def _build_env_end_event(self): def _build_env_end_event(self):
with self.log_writer.mode(self.phase) as logw: with self.log_writer.mode(self.phase) as logw:
...@@ -834,4 +855,14 @@ class SequenceLabelTask(BasicTask): ...@@ -834,4 +855,14 @@ class SequenceLabelTask(BasicTask):
feed_list = [varname for varname in self._base_feed_list] feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase: if self.is_train_phase or self.is_test_phase:
feed_list += [self.label.name, self.seq_len.name] feed_list += [self.label.name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
return feed_list return feed_list
@property
def fetch_list(self):
if self.is_train_phase or self.is_test_phase:
return [metric.name for metric in self.metrics] + [self.loss.name]
elif self.is_predict_phase:
return [self.ret_infers.name] + [self.seq_len.name]
return [self.output.name]
...@@ -463,13 +463,22 @@ class Module(object): ...@@ -463,13 +463,22 @@ class Module(object):
with fluid.program_guard(program): with fluid.program_guard(program):
result = [] result = []
index = 0 index = 0
place = fluid.CPUPlace() if "PADDLEHUB_CUDA_ENABLE" in os.environ:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
if "PADDLEHUB_BATCH_SIZE" in os.environ:
batch_size = os.environ["PADDLEHUB_BATCH_SIZE"]
else:
batch_size = 1
exe = fluid.Executor(place=place) exe = fluid.Executor(place=place)
data = self.processor.preprocess( data = self.processor.preprocess(
sign_name=sign_name, data_dict=data) sign_name=sign_name, data_dict=data)
data_format = self.processor.data_format(sign_name=sign_name) data_format = self.processor.data_format(sign_name=sign_name)
reader, feeder = _get_reader_and_feeder(data_format, data, place) reader, feeder = _get_reader_and_feeder(data_format, data, place)
reader = paddle.batch(reader, batch_size=2) reader = paddle.batch(reader, batch_size=batch_size)
for batch in reader(): for batch in reader():
data_out = exe.run( data_out = exe.run(
feed=feeder.feed(batch), feed=feeder.feed(batch),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册