提交 f04de234 编写于 作者: S Steffy-zxf 提交者: wuzewu

Update the demo text-cls and seq-label (#47)

* Update the demo text-cls and seq-label
上级 d64a1b6d
......@@ -33,6 +33,7 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
args = parser.parse_args()
# yapf: enable.
......@@ -40,8 +41,7 @@ args = parser.parse_args()
if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
input_dict, output_dict, program = module.context(
max_seq_len=args.max_seq_len)
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence labeling dataset reader
dataset = hub.dataset.MSRA_NER()
......@@ -53,70 +53,61 @@ if __name__ == '__main__':
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.program_guard(program):
# Use "sequence_outputs" for token-level output.
sequence_output = output_dict["sequence_output"]
# Define a classification finetune task by PaddleHub's API
seq_label_task = hub.create_seq_label_task(
feature=sequence_output,
num_classes=dataset.num_labels,
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
# Must feed all the tensors that ERNIE's module needs
# Compared to the classification task, we need to add the seq_len tensor to the feed list
feed_list = [
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
seq_label_task.variable('label').name,
seq_label_task.variable('seq_len').name
]
fetch_list = [
seq_label_task.variable("labels").name,
seq_label_task.variable("infers").name,
seq_label_task.variable("seq_len").name
]
# classification probability tensor
probs = seq_label_task.variable("probs")
# load best model checkpoint
fluid.io.load_persistables(exe, args.checkpoint_dir)
inference_program = program.clone(for_test=True)
# calculate the num of label from probs variable shape
num_labels = seq_label_task.variable("probs").shape[1]
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
test_reader = reader.data_generator(phase='test', shuffle=False)
test_examples = dataset.get_test_examples()
total_label, total_infer, total_correct = 0.0, 0.0, 0.0
for index, batch in enumerate(test_reader()):
np_labels, np_infers, np_lens = exe.run(
feed=data_feeder.feed(batch),
fetch_list=fetch_list,
program=inference_program)
label_num, infer_num, correct_num = chunk_eval(
np_labels, np_infers, np_lens, num_labels)
total_infer += infer_num
total_label += label_num
total_correct += correct_num
labels = np_labels.reshape([-1]).astype(np.int32).tolist()
label_str = ""
count = 0
for label_val in labels:
label_str += inv_label_map[label_val]
count += 1
if count == np_lens:
break
print("%s\tpredict=%s" % (test_examples[index], label_str))
precision, recall, f1 = calculate_f1(total_label, total_infer,
total_correct)
print("F1-Score=%f, precision=%f, recall=%f " % (f1, precision, recall))
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Set up running config for PaddleHub Finetune API
config = hub.RunConfig(
use_cuda=args.use_gpu,
batch_size=args.batch_size,
enable_memory_optim=False,
checkpoint_dir=args.checkpoint_dir,
strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config)
# test data
data = [
["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"],
["其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"],
["有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"],
["不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
]
results = seq_label_task.predict(data=data)[0]
np_infers = results[0]
np_lens = results[1]
for index, text in enumerate(data):
labels = np_infers.reshape([-1]).astype(
np.int32).tolist()[args.max_seq_len * index:args.max_seq_len *
(index + 1)]
label_str = ""
count = 0
for label_val in labels:
label_str += inv_label_map[label_val]
count += 1
if count == (np_lens[index]):
break
# Drop the label results of CLS and SEP Token
print("%s\tpredict=%s" % (text[0], label_str[1:-1]))
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_sequence_label/best_model"
CKPT_DIR="./ckpt_sequence_label"
python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
......@@ -34,36 +34,28 @@ args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
# Step1: load Paddlehub ERNIE pretrained model
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Step2: Download dataset and use SequenceLabelReader to read dataset
# Download dataset and use SequenceLabelReader to read dataset
dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Step3: construct transfer learning network
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.create_seq_label_task(
feature=sequence_output,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
# Compared to classification task, we need add seq_len tensor to feedlist
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name,
seq_label_task.variable('label').name,
seq_label_task.variable('seq_len').name
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Select a finetune strategy
......@@ -81,10 +73,15 @@ if __name__ == '__main__':
checkpoint_dir=args.checkpoint_dir,
strategy=strategy)
# Finetune and evaluate model by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
hub.finetune_and_eval(
task=seq_label_task,
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config)
# Finetune and evaluate model by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
seq_label_task.finetune_and_eval()
......@@ -31,6 +31,7 @@ import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
args = parser.parse_args()
......@@ -39,8 +40,7 @@ args = parser.parse_args()
if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
input_dict, output_dict, program = module.context(
max_seq_len=args.max_seq_len)
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence classification dataset reader
dataset = hub.dataset.ChnSentiCorp()
......@@ -51,46 +51,53 @@ if __name__ == '__main__':
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.program_guard(program):
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
pooled_output = output_dict["pooled_output"]
# Define a classification finetune task by PaddleHub's API
cls_task = hub.create_text_cls_task(
feature=pooled_output, num_classes=dataset.num_labels)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
cls_task.variable('label').name
]
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# classification probability tensor
probs = cls_task.variable("probs")
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
pred = fluid.layers.argmax(probs, axis=1)
# Set up running config for PaddleHub Finetune API
config = hub.RunConfig(
use_cuda=args.use_gpu,
batch_size=args.batch_size,
enable_memory_optim=False,
checkpoint_dir=args.checkpoint_dir,
strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
# load best model checkpoint
fluid.io.load_persistables(exe, args.checkpoint_dir)
# Define a classification finetune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
inference_program = program.clone(for_test=True)
# Data to be predicted
data = [
["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
[
"还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。"
],
[
"前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦"
], ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]
]
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
test_reader = reader.data_generator(phase='test', shuffle=False)
test_examples = dataset.get_test_examples()
total = 0
correct = 0
for index, batch in enumerate(test_reader()):
pred_v = exe.run(
feed=data_feeder.feed(batch),
fetch_list=[pred.name],
program=inference_program)
total += 1
if (pred_v[0][0] == int(test_examples[index].label)):
correct += 1
acc = 1.0 * correct / total
print("%s\tpredict=%s" % (test_examples[index], pred_v[0][0]))
print("accuracy = %f" % acc)
index = 0
results = cls_task.predict(data=data)
for batch_result in results:
# get predict index
batch_result = np.argmax(batch_result, axis=2)[0]
for result in batch_result:
print("%s\tpredict=%s" % (data[index][0], result))
index += 1
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_chnsenticorp/best_model"
CKPT_DIR="./ckpt_chnsenticorp"
python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False
......@@ -36,13 +36,13 @@ args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
# Step1: load Paddlehub ERNIE pretrained model
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
# module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Step2: Download dataset and use ClassifyReader to read dataset
# Download dataset and use ClassifyReader to read dataset
dataset = None
if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp()
......@@ -58,29 +58,25 @@ if __name__ == '__main__':
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Step3: construct transfer learning network
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Define a classfication finetune task by PaddleHub's API
cls_task = hub.create_text_cls_task(
feature=pooled_output, num_classes=dataset.num_labels)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name,
cls_task.variable('label').name
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Step4: Select finetune strategy, setup config and finetune
# Select finetune strategy, setup config and finetune
strategy = hub.AdamWeightDecayStrategy(
weight_decay=args.weight_decay,
learning_rate=args.learning_rate,
lr_scheduler="linear_decay",
)
lr_scheduler="linear_decay")
# Set up running config for PaddleHub Finetune API
config = hub.RunConfig(
......@@ -90,7 +86,14 @@ if __name__ == '__main__':
checkpoint_dir=args.checkpoint_dir,
strategy=strategy)
# Define a classification finetune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
# Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
hub.finetune_and_eval(
task=cls_task, data_reader=reader, feed_list=feed_list, config=config)
cls_task.finetune_and_eval()
......@@ -743,6 +743,14 @@ class SequenceLabelTask(BasicTask):
name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.)))
self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
ret_infers = fluid.layers.assign(self.ret_infers)
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
logits = self.logits
logits = fluid.layers.flatten(logits, axis=2)
logits = fluid.layers.softmax(logits)
......@@ -761,13 +769,8 @@ class SequenceLabelTask(BasicTask):
return loss
def _add_metrics(self):
ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1])
ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
return [ret_labels, ret_infers, seq_len]
self.ret_labels = fluid.layers.reshape(x=self.label, shape=[-1, 1])
return [self.ret_labels, self.ret_infers, self.seq_len]
def _build_env_end_event(self):
with self.log_writer.mode(self.phase) as logw:
......@@ -834,4 +837,14 @@ class SequenceLabelTask(BasicTask):
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.label.name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
return feed_list
@property
def fetch_list(self):
# Names of the variables to fetch, chosen by the current phase.
# Train/test phases: every metric variable followed by the loss.
if self.is_train_phase or self.is_test_phase:
return [metric.name for metric in self.metrics] + [self.loss.name]
# Predict phase: the inferred label ids and the sequence lengths.
elif self.is_predict_phase:
return [self.ret_infers.name] + [self.seq_len.name]
# Any other phase falls back to the task's output variable.
return [self.output.name]
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册