未验证 提交 95395059 编写于 作者: K kinghuin 提交者: GitHub

add generation task (#727)

上级 6d331d8e
...@@ -7,4 +7,4 @@ python -u predict.py \ ...@@ -7,4 +7,4 @@ python -u predict.py \
--checkpoint_dir=$CKPT_DIR \ --checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \ --max_seq_len=128 \
--use_gpu=True \ --use_gpu=True \
--batch_size=24 --batch_size=1
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fine-tuning on classification task """
import argparse
import ast
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
# Load Paddlehub ERNIE Tiny pretrained model
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and get its label list and label num
# If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.Couplet()
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir,
strategy=hub.AdamWeightDecayStrategy())
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]
# Define a classfication fine-tune task by PaddleHub's API
gen_task = hub.TextGenerationTask(
feature=pooled_output,
token_feature=sequence_output,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config,
metrics_choices=["bleu"])
# Data to be predicted
text_a = ["人增福寿年增岁", "风吹云乱天垂泪", "若有经心风过耳"]
# Add 0x02 between characters to match the format of training data,
# otherwise the length of prediction results will not match the input string
# if the input string contains non-Chinese characters.
formatted_text_a = list(map("\002".join, text_a))
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it use BertTokenizer too.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in formatted_text_a
]
print(
gen_task.predict(
data=encoded_data, label_list=label_list, accelerate_mode=False))
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_generation"
python -u predict.py \
--checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \
--use_gpu=True \
--batch_size=1
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_generation"
python -u text_gen.py \
--batch_size 16 \
--num_epoch 30 \
--checkpoint_dir $CKPT_DIR \
--max_seq_len 50 \
--learning_rate 5e-3 \
--cut_fraction 0.1 \
--use_data_parallel True
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fine-tuning on classification task """
import argparse
import ast
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--cut_fraction", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
# Load Paddlehub ERNIE Tiny pretrained model
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it use BertTokenizer too.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.Couplet(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.ULMFiTStrategy(
learning_rate=args.learning_rate,
optimizer_name="adam",
cut_fraction=args.cut_fraction,
dis_params_layer=module.get_params_layer(),
frz_params_layer=module.get_params_layer())
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir,
strategy=strategy)
# Define a classfication fine-tune task by PaddleHub's API
gen_task = hub.TextGenerationTask(
dataset=dataset,
feature=pooled_output,
token_feature=sequence_output,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config,
metrics_choices=["bleu"])
# Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
gen_task.finetune_and_eval()
...@@ -60,6 +60,7 @@ from .finetune.task import SequenceLabelTask ...@@ -60,6 +60,7 @@ from .finetune.task import SequenceLabelTask
from .finetune.task import MultiLabelClassifierTask from .finetune.task import MultiLabelClassifierTask
from .finetune.task import RegressionTask from .finetune.task import RegressionTask
from .finetune.task import ReadingComprehensionTask from .finetune.task import ReadingComprehensionTask
from .finetune.task import TextGenerationTask
from .finetune.config import RunConfig from .finetune.config import RunConfig
from .finetune.strategy import AdamWeightDecayStrategy from .finetune.strategy import AdamWeightDecayStrategy
from .finetune.strategy import DefaultStrategy from .finetune.strategy import DefaultStrategy
......
...@@ -52,13 +52,19 @@ def get_variable_info(var): ...@@ -52,13 +52,19 @@ def get_variable_info(var):
var_info = { var_info = {
'name': var.name, 'name': var.name,
'dtype': convert_dtype_to_string(var.dtype),
'lod_level': var.lod_level,
'shape': var.shape,
'stop_gradient': var.stop_gradient, 'stop_gradient': var.stop_gradient,
'is_data': var.is_data, 'is_data': var.is_data,
'error_clip': var.error_clip 'error_clip': var.error_clip,
'type': var.type
} }
try:
var_info['dtype'] = convert_dtype_to_string(var.dtype)
var_info['lod_level'] = var.lod_level
var_info['shape'] = var.shape
except:
pass
if isinstance(var, fluid.framework.Parameter): if isinstance(var, fluid.framework.Parameter):
var_info['trainable'] = var.trainable var_info['trainable'] = var.trainable
var_info['optimize_attr'] = var.optimize_attr var_info['optimize_attr'] = var.optimize_attr
...@@ -153,17 +159,34 @@ def _copy_vars_and_ops_in_blocks(from_block, to_block): ...@@ -153,17 +159,34 @@ def _copy_vars_and_ops_in_blocks(from_block, to_block):
to_block.create_var(**var_info) to_block.create_var(**var_info)
for op in from_block.ops: for op in from_block.ops:
all_attrs = op.all_attrs()
if 'sub_block' in all_attrs:
_sub_block = to_block.program._create_block()
_copy_vars_and_ops_in_blocks(all_attrs['sub_block'], _sub_block)
to_block.program._rollback()
new_attrs = {'sub_block': _sub_block}
for key, value in all_attrs.items():
if key == 'sub_block':
continue
new_attrs[key] = copy.deepcopy(value)
else:
new_attrs = copy.deepcopy(all_attrs)
op_info = { op_info = {
'type': op.type, 'type': op.type,
'inputs': { 'inputs': {
input: [to_block.var(var) for var in op.input(input)] input:
[to_block._find_var_recursive(var) for var in op.input(input)]
for input in op.input_names for input in op.input_names
}, },
'outputs': { 'outputs': {
output: [to_block.var(var) for var in op.output(output)] output: [
to_block._find_var_recursive(var)
for var in op.output(output)
]
for output in op.output_names for output in op.output_names
}, },
'attrs': copy.deepcopy(op.all_attrs()) 'attrs': new_attrs
} }
to_block.append_op(**op_info) to_block.append_op(**op_info)
......
...@@ -30,6 +30,7 @@ from .cmrc2018 import CMRC2018 ...@@ -30,6 +30,7 @@ from .cmrc2018 import CMRC2018
from .bq import BQ from .bq import BQ
from .iflytek import IFLYTEK from .iflytek import IFLYTEK
from .thucnews import THUCNEWS from .thucnews import THUCNEWS
from .couplet import Couplet
# CV Dataset # CV Dataset
from .dogcat import DogCatDataset as DogCat from .dogcat import DogCatDataset as DogCat
......
...@@ -68,7 +68,8 @@ class BaseNLPDataset(BaseDataset): ...@@ -68,7 +68,8 @@ class BaseNLPDataset(BaseDataset):
if not self.tokenizer or not examples: if not self.tokenizer or not examples:
return [] return []
logger.info("Processing the train set...") logger.info("Processing the train set...")
self._train_records = self._convert_examples_to_records(examples) self._train_records = self._convert_examples_to_records(
examples, phase="train")
return self._train_records return self._train_records
@property @property
...@@ -78,7 +79,8 @@ class BaseNLPDataset(BaseDataset): ...@@ -78,7 +79,8 @@ class BaseNLPDataset(BaseDataset):
if not self.tokenizer or not examples: if not self.tokenizer or not examples:
return [] return []
logger.info("Processing the dev set...") logger.info("Processing the dev set...")
self._dev_records = self._convert_examples_to_records(examples) self._dev_records = self._convert_examples_to_records(
examples, phase="dev")
return self._dev_records return self._dev_records
@property @property
...@@ -88,7 +90,8 @@ class BaseNLPDataset(BaseDataset): ...@@ -88,7 +90,8 @@ class BaseNLPDataset(BaseDataset):
if not self.tokenizer or not examples: if not self.tokenizer or not examples:
return [] return []
logger.info("Processing the test set...") logger.info("Processing the test set...")
self._test_records = self._convert_examples_to_records(examples) self._test_records = self._convert_examples_to_records(
examples, phase="test")
return self._test_records return self._test_records
@property @property
...@@ -98,7 +101,8 @@ class BaseNLPDataset(BaseDataset): ...@@ -98,7 +101,8 @@ class BaseNLPDataset(BaseDataset):
if not self.tokenizer or not examples: if not self.tokenizer or not examples:
return [] return []
logger.info("Processing the predict set...") logger.info("Processing the predict set...")
self._predict_records = self._convert_examples_to_records(examples) self._predict_records = self._convert_examples_to_records(
examples, phase="predict")
return self._predict_records return self._predict_records
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -148,12 +152,14 @@ class BaseNLPDataset(BaseDataset): ...@@ -148,12 +152,14 @@ class BaseNLPDataset(BaseDataset):
examples.append(example) examples.append(example)
return examples return examples
def _convert_examples_to_records(self, examples): def _convert_examples_to_records(self, examples, phase):
""" """
Returns a list[dict] including all the input information what the model need. Returns a list[dict] including all the input information what the model need.
Args: Args:
examples (list): the data example, returned by _read_file. examples (list): the data example, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns: Returns:
a list with all the examples record. a list with all the examples record.
...@@ -166,8 +172,8 @@ class BaseNLPDataset(BaseDataset): ...@@ -166,8 +172,8 @@ class BaseNLPDataset(BaseDataset):
text_pair=example.text_b, text_pair=example.text_b,
max_seq_len=self.max_seq_len) max_seq_len=self.max_seq_len)
if example.label: if example.label:
record["label"] = self.label_list.index( record["label"] = self.label_index[
example.label) if self.label_list else float(example.label) example.label] if self.label_list else float(example.label)
records.append(record) records.append(record)
return records return records
...@@ -281,12 +287,14 @@ class BaseNLPDataset(BaseDataset): ...@@ -281,12 +287,14 @@ class BaseNLPDataset(BaseDataset):
class TextClassificationDataset(BaseNLPDataset): class TextClassificationDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples): def _convert_examples_to_records(self, examples, phase):
""" """
Returns a list[dict] including all the input information what the model need. Returns a list[dict] including all the input information what the model need.
Args: Args:
examples (list): the data example, returned by _read_file. examples (list): the data example, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns: Returns:
a list with all the examples record. a list with all the examples record.
...@@ -299,18 +307,19 @@ class TextClassificationDataset(BaseNLPDataset): ...@@ -299,18 +307,19 @@ class TextClassificationDataset(BaseNLPDataset):
text_pair=example.text_b, text_pair=example.text_b,
max_seq_len=self.max_seq_len) max_seq_len=self.max_seq_len)
if example.label: if example.label:
record["label"] = self.label_list.index(example.label) record["label"] = self.label_index[example.label]
records.append(record) records.append(record)
return records return records
class RegressionDataset(BaseNLPDataset): class RegressionDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples): def _convert_examples_to_records(self, examples, phase):
""" """
Returns a list[dict] including all the input information what the model need. Returns a list[dict] including all the input information what the model need.
Args: Args:
examples (list): the data example, returned by _read_file. examples (list): the data example, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns: Returns:
a list with all the examples record. a list with all the examples record.
...@@ -328,6 +337,80 @@ class RegressionDataset(BaseNLPDataset): ...@@ -328,6 +337,80 @@ class RegressionDataset(BaseNLPDataset):
return records return records
class GenerationDataset(BaseNLPDataset):
def __init__(self,
base_path,
train_file=None,
dev_file=None,
test_file=None,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False,
tokenizer=None,
max_seq_len=128,
split_char="\002",
start_token="<s>",
end_token="</s>",
unk_token="<unk>"):
self.split_char = split_char
self.start_token = start_token
self.end_token = end_token
self.unk_token = unk_token
super(GenerationDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header,
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _convert_examples_to_records(self, examples, phase):
"""
Returns a list[dict] including all the input information what the model need.
Args:
examples (list): the data example, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns:
a list with all the examples record.
"""
records = []
for example in examples:
record = self.tokenizer.encode(
text=example.text_a.split(self.split_char),
text_pair=example.text_b.split(self.split_char)
if example.text_b else None,
max_seq_len=self.max_seq_len)
if example.label:
expand_label = [self.start_token] + example.label.split(
self.split_char)[:self.max_seq_len - 2] + [self.end_token]
expand_label_id = [
self.label_index.get(label,
self.label_index[self.unk_token])
for label in expand_label
]
record["label"] = expand_label_id[1:] + [
self.label_index[self.end_token]
] * (self.max_seq_len - len(expand_label) + 1)
record["dec_input"] = expand_label_id[:-1] + [
self.label_index[self.end_token]
] * (self.max_seq_len - len(expand_label) + 1)
records.append(record)
return records
class SeqLabelingDataset(BaseNLPDataset): class SeqLabelingDataset(BaseNLPDataset):
def __init__(self, def __init__(self,
base_path, base_path,
...@@ -363,12 +446,13 @@ class SeqLabelingDataset(BaseNLPDataset): ...@@ -363,12 +446,13 @@ class SeqLabelingDataset(BaseNLPDataset):
tokenizer=tokenizer, tokenizer=tokenizer,
max_seq_len=max_seq_len) max_seq_len=max_seq_len)
def _convert_examples_to_records(self, examples): def _convert_examples_to_records(self, examples, phase):
""" """
Returns a list[dict] including all the input information what the model need. Returns a list[dict] including all the input information what the model need.
Args: Args:
examples (list): the data examples, returned by _read_file. examples (list): the data examples, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns: Returns:
a list with all the examples record. a list with all the examples record.
...@@ -389,11 +473,11 @@ class SeqLabelingDataset(BaseNLPDataset): ...@@ -389,11 +473,11 @@ class SeqLabelingDataset(BaseNLPDataset):
if tokens_index < len( if tokens_index < len(
tokens) and token == tokens[tokens_index]: tokens) and token == tokens[tokens_index]:
record["label"].append( record["label"].append(
self.label_list.index(labels[tokens_index])) self.label_index[labels[tokens_index]])
tokens_index += 1 tokens_index += 1
else: else:
record["label"].append( record["label"].append(
self.label_list.index(self.no_entity_label)) self.label_index[self.no_entity_label])
records.append(record) records.append(record)
return records return records
...@@ -435,13 +519,13 @@ class SeqLabelingDataset(BaseNLPDataset): ...@@ -435,13 +519,13 @@ class SeqLabelingDataset(BaseNLPDataset):
class MultiLabelDataset(BaseNLPDataset): class MultiLabelDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples): def _convert_examples_to_records(self, examples, phase):
""" """
Returns a list[dict] including all the input information what the model need. Returns a list[dict] including all the input information what the model need.
Args: Args:
examples (list): the data examples, returned by _read_file. examples (list): the data examples, returned by _read_file.
max_seq_len (int): padding to the max sequence length. phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns: Returns:
a list with all the examples record. a list with all the examples record.
...@@ -631,7 +715,16 @@ class MRCDataset(BaseNLPDataset): ...@@ -631,7 +715,16 @@ class MRCDataset(BaseNLPDataset):
return special_tokens_num, special_tokens_num_before_doc return special_tokens_num, special_tokens_num_before_doc
def _convert_examples_to_records_and_features(self, examples, phase): def _convert_examples_to_records_and_features(self, examples, phase):
"""Loads a data file into a list of `InputBatch`s.""" """
Returns a list[dict] including all the input information what the model need.
Args:
examples (list): the data examples, returned by _read_file.
phase (str): the processing phase, can be "train" "dev" "test" or "predict".
Returns:
a list with all the examples record.
"""
features = [] features = []
records = [] records = []
unique_id = 1000000000 unique_id = 1000000000
......
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import codecs
import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import GenerationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/couplet.tar.gz"
class Couplet(GenerationDataset):
"""
An open source couplet dataset, see https://github.com/v-zich/couplet-clean-dataset for details.
"""
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "couplet")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
with open(os.path.join(dataset_dir, "vocab.txt")) as vocab_file:
label_list = [line.strip() for line in vocab_file.readlines()]
super(Couplet, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file="dev.tsv",
test_file="test.tsv",
label_list=label_list,
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with codecs.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
for line in reader:
example = InputExample(
guid=seq_id, label=line[1], text_a=line[0])
seq_id += 1
examples.append(example)
return examples
if __name__ == "__main__":
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = Couplet(tokenizer=tokenizer, max_seq_len=30)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("guid: {}\ttext_a: {}\ttext_b: {}\tlabel: {}".format(
e.guid, e.text_a, e.text_b, e.label))
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("guid: {}\ttext_a: {}\ttext_b: {}\tlabel: {}".format(
e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("guid: {}\ttext_a: {}\ttext_b: {}\tlabel: {}".format(
e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -106,6 +106,9 @@ class BaseDataset(object): ...@@ -106,6 +106,9 @@ class BaseDataset(object):
"As label_list has been assigned, label_file is noneffective" "As label_list has been assigned, label_file is noneffective"
) )
self.label_index = dict(
zip(self.label_list, range(len(self.label_list))))
def get_train_examples(self): def get_train_examples(self):
return self.train_examples return self.train_examples
......
...@@ -12,10 +12,8 @@ ...@@ -12,10 +12,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import collections
from __future__ import absolute_import import math
from __future__ import division
from __future__ import print_function
import numpy as np import numpy as np
...@@ -153,7 +151,7 @@ def matthews_corrcoef(preds, labels): ...@@ -153,7 +151,7 @@ def matthews_corrcoef(preds, labels):
def recall_nk(data, n, k, m): def recall_nk(data, n, k, m):
''' """
This metric can be used to evaluate whether the model can find the correct response B for question A This metric can be used to evaluate whether the model can find the correct response B for question A
Note: Only applies to each question A only has one correct response B1. Note: Only applies to each question A only has one correct response B1.
...@@ -170,7 +168,7 @@ def recall_nk(data, n, k, m): ...@@ -170,7 +168,7 @@ def recall_nk(data, n, k, m):
m: int. For every m examples, there's going to be a positive sample. m: int. For every m examples, there's going to be a positive sample.
eg. data= [A1,B1,1], [A1,B2,0], [A1,B3,0], [A2,B1,1], [A2,B2,0], [A2,B3,0] eg. data= [A1,B1,1], [A1,B2,0], [A1,B3,0], [A2,B1,1], [A2,B2,0], [A2,B3,0]
For every 3 examples, there will be one positive sample. so m=3, and n can be 1,2 or 3. For every 3 examples, there will be one positive sample. so m=3, and n can be 1,2 or 3.
''' """
def get_p_at_n_in_m(data, n, k, ind): def get_p_at_n_in_m(data, n, k, ind):
""" """
...@@ -194,3 +192,94 @@ def recall_nk(data, n, k, m): ...@@ -194,3 +192,94 @@ def recall_nk(data, n, k, m):
correct_num += get_p_at_n_in_m(data, n, k, ind) correct_num += get_p_at_n_in_m(data, n, k, ind)
return correct_num / length return correct_num / length
def _get_ngrams(segment, max_order):
"""
Extracts all n-grams upto a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
methods.
Returns:
The Counter containing all n-grams upto max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i + order])
ngram_counts[ngram] += 1
return ngram_counts
def compute_bleu(reference_corpus,
translation_corpus,
max_order=4,
smooth=False):
"""
Computes BLEU score of translated segments against one or more references.
Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.
Returns:
3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
precisions and brevity penalty.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (reference, translation) in zip(reference_corpus, translation_corpus):
reference_length += len(reference)
translation_length += len(translation)
merged_ref_ngram_counts = collections.Counter()
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for order in range(1, max_order + 1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order - 1] += possible_matches
precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = ((matches_by_order[i] + 1.) /
(possible_matches_by_order[i] + 1.))
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (
float(matches_by_order[i]) / possible_matches_by_order[i])
else:
precisions[i] = 0.0
if min(precisions) > 0:
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
geo_mean = 0
ratio = float(translation_length) / reference_length
if ratio > 1.0:
bp = 1.
elif ratio > 0.0:
bp = math.exp(1 - 1. / ratio)
else:
bp = 0
bleu = geo_mean * bp
return (bleu, precisions, bp, ratio, translation_length, reference_length)
...@@ -19,3 +19,4 @@ from .detection_task import DetectionTask ...@@ -19,3 +19,4 @@ from .detection_task import DetectionTask
from .reading_comprehension_task import ReadingComprehensionTask from .reading_comprehension_task import ReadingComprehensionTask
from .regression_task import RegressionTask from .regression_task import RegressionTask
from .sequence_task import SequenceLabelTask from .sequence_task import SequenceLabelTask
from .generation_task import TextGenerationTask
...@@ -344,6 +344,8 @@ class BaseTask(object): ...@@ -344,6 +344,8 @@ class BaseTask(object):
self.dataset = dataset self.dataset = dataset
if dataset: if dataset:
self._label_list = dataset.get_labels() self._label_list = dataset.get_labels()
else:
self._label_list = None
# Compatible code for usage deprecated in paddlehub v1.8. # Compatible code for usage deprecated in paddlehub v1.8.
self._base_data_reader = data_reader self._base_data_reader = data_reader
self._base_feed_list = feed_list self._base_feed_list = feed_list
...@@ -1099,23 +1101,29 @@ class BaseTask(object): ...@@ -1099,23 +1101,29 @@ class BaseTask(object):
or a plaintext string list when the task is initialized with data_reader param (deprecated in paddlehub v1.8). or a plaintext string list when the task is initialized with data_reader param (deprecated in paddlehub v1.8).
label_list (list): the label list, used to proprocess the output. label_list (list): the label list, used to proprocess the output.
load_best_model (bool): load the best model or not load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result. Always True when the task is not initialized with data_reader param. return_result (bool): return a readable result or just the raw run result. Always True when the task is not initialized with data_reader but dataset parameter.
accelerate_mode (bool): use high-performance predictor or not accelerate_mode (bool): use high-performance predictor or not
Returns: Returns:
RunState: the running result of predict phase RunState: the running result of predict phase
""" """
if accelerate_mode and isinstance(self._base_data_reader, if accelerate_mode:
hub.reader.LACClassifyReader): if isinstance(self._base_data_reader, hub.reader.LACClassifyReader):
logger.warning( logger.warning(
"LACClassifyReader does not support predictor, the accelerate_mode is closed now." "LACClassifyReader does not support predictor, the accelerate_mode is closed now."
) )
accelerate_mode = False accelerate_mode = False
elif isinstance(self, hub.TextGenerationTask):
logger.warning(
"TextGenerationTask does not support predictor, the accelerate_mode is closed now."
)
accelerate_mode = False
self.accelerate_mode = accelerate_mode self.accelerate_mode = accelerate_mode
with self.phase_guard(phase="predict"): with self.phase_guard(phase="predict"):
self._predict_data = data self._predict_data = data
self._label_list = label_list if label_list:
self._label_list = label_list
self._predict_start_event() self._predict_start_event()
if load_best_model: if load_best_model:
......
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid.layers import RNNCell, LSTMCell, rnn, BeamSearchDecoder, dynamic_decode
from paddlehub.finetune.evaluate import compute_bleu
from .base_task import BaseTask
class AttentionDecoderCell(RNNCell):
def __init__(self, num_layers, hidden_size, dropout_prob=0.,
init_scale=0.1):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout_prob = dropout_prob
self.lstm_cells = []
self.init_scale = init_scale
param_attr = ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0))
for i in range(num_layers):
self.lstm_cells.append(LSTMCell(hidden_size, param_attr, bias_attr))
def attention(self, query, enc_output, mask=None):
query = fluid.layers.unsqueeze(query, [1])
memory = fluid.layers.fc(
enc_output,
self.hidden_size,
num_flatten_dims=2,
param_attr=ParamAttr(
name="dec_memory_w",
initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale)))
attn = fluid.layers.matmul(query, memory, transpose_y=True)
if mask:
attn = fluid.layers.transpose(attn, [1, 0, 2])
attn = fluid.layers.elementwise_add(attn, mask * 1000000000, -1)
attn = fluid.layers.transpose(attn, [1, 0, 2])
weight = fluid.layers.softmax(attn)
weight_memory = fluid.layers.matmul(weight, memory)
return weight_memory
def call(self, step_input, states, enc_output, enc_padding_mask=None):
lstm_states, input_feed = states
new_lstm_states = []
step_input = fluid.layers.concat([step_input, input_feed], 1)
for i in range(self.num_layers):
out, new_lstm_state = self.lstm_cells[i](step_input, lstm_states[i])
step_input = fluid.layers.dropout(
out,
self.dropout_prob,
dropout_implementation='upscale_in_train'
) if self.dropout_prob > 0 else out
new_lstm_states.append(new_lstm_state)
dec_att = self.attention(step_input, enc_output, enc_padding_mask)
dec_att = fluid.layers.squeeze(dec_att, [1])
concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
out = fluid.layers.fc(
concat_att_out,
self.hidden_size,
param_attr=ParamAttr(
name="dec_out_w",
initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale)))
return out, [new_lstm_states, out]
class TextGenerationTask(BaseTask):
"""
TextGenerationTask use rnn as decoder and beam search technology when predict.
Args:
feature(Variable): The sentence-level feature, shape as [-1, emb_size].
token_feature(Variable): The token-level feature, shape as [-1, seq_len, emb_size].
max_seq_len(int): the decoder max sequence length.
num_classes(int): total labels of the task.
dataset(GenerationDataset): the dataset containing training set, development set and so on. If you want to finetune the model, you should set it.
Otherwise, if you just want to use the model to predict, you can omit it. Default None
num_layers(int): the decoder rnn layers number. Default 1
hidden_size(int): the decoder rnn hidden size. Default 128
dropout(float): the decoder dropout rate. Default 0.
beam_size(int): the beam search size during predict phase. Default 10.
beam_max_step_num(int): the beam search max step number. Default 30.
start_token(str): the beam search start token. Default "<s>"
end_token(str): the beam search end token. Default "</s>"
startup_program(Program): the customized startup_program, default None
config(RunConfig): the config for the task, default None
metrics_choices(list): metrics used to the task, default ["bleu"]
"""
def __init__(
self,
feature,
token_feature,
max_seq_len,
num_classes,
dataset=None,
num_layers=1,
hidden_size=512,
dropout=0.,
beam_size=10,
beam_max_step_num=30,
start_token="<s>",
end_token="</s>",
startup_program=None,
config=None,
metrics_choices="default",
):
if metrics_choices == "default":
metrics_choices = ["bleu"]
main_program = feature.block.program
super(TextGenerationTask, self).__init__(
dataset=dataset,
main_program=main_program,
startup_program=startup_program,
config=config,
metrics_choices=metrics_choices)
self.num_layers = num_layers
self.hidden_size = hidden_size
self.dropout = dropout
self.token_feature = token_feature
self.feature = feature
self.max_seq_len = max_seq_len
self.num_classes = num_classes
self.beam_size = beam_size
self.beam_max_step_num = beam_max_step_num
self.start_token = start_token
self.end_token = end_token
def _add_label(self):
label = fluid.layers.data(
name="label", shape=[self.max_seq_len, 1], dtype='int64')
return [label]
def _build_net(self):
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0)
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
src_mask = fluid.layers.sequence_mask(
self.seq_len_used, maxlen=self.max_seq_len, dtype='float32')
enc_padding_mask = (src_mask - 1.0)
# Define decoder and initialize it.
dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size,
self.dropout)
enc_last_step = fluid.layers.slice(
self.token_feature,
axes=[1],
starts=[-1],
ends=[self.token_feature.shape[1] + 1])
dec_init_cell = fluid.layers.fc(
input=enc_last_step,
size=self.hidden_size,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
name="dec_init_cell_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="dec_init_cell_b",
initializer=fluid.initializer.Constant(0.)))
dec_init_hidden = fluid.layers.fc(
input=self.feature,
size=self.hidden_size,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
name="dec_init_hidden_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="dec_init_hidden_b",
initializer=fluid.initializer.Constant(0.)))
# TODO: maybe dec_init_hidden can use self.feature, and dec_init_cell can be get_initial_states
dec_initial_states = [
[[dec_init_hidden, dec_init_cell]] * self.num_layers,
dec_cell.get_initial_states(
batch_ref=self.token_feature, shape=[self.hidden_size])
]
tar_vocab_size = len(self._label_list)
tar_embeder = lambda x: fluid.embedding(
input=x,
size=[tar_vocab_size, self.hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
name='target_embedding',
initializer=fluid.initializer.UniformInitializer(
low=-0.1, high=0.1)))
start_token_id = self._label_list.index(self.start_token)
end_token_id = self._label_list.index(self.end_token)
if not self.is_predict_phase:
self.dec_input = fluid.layers.data(
name="dec_input", shape=[self.max_seq_len], dtype='int64')
tar_emb = tar_embeder(self.dec_input)
dec_output, _ = rnn(
cell=dec_cell,
inputs=tar_emb,
initial_states=dec_initial_states,
sequence_length=None,
enc_output=self.token_feature,
enc_padding_mask=enc_padding_mask)
self.logits = fluid.layers.fc(
dec_output,
size=tar_vocab_size,
num_flatten_dims=len(dec_output.shape) - 1,
param_attr=fluid.ParamAttr(
name="output_w",
initializer=fluid.initializer.UniformInitializer(
low=-0.1, high=0.1)))
self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
logits = self.logits
logits = fluid.layers.softmax(logits)
return [logits]
else:
output_layer = lambda x: fluid.layers.fc(
x,
size=tar_vocab_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name="output_w"))
beam_search_decoder = BeamSearchDecoder(
dec_cell,
start_token_id,
end_token_id,
self.beam_size,
embedding_fn=tar_embeder,
output_fn=output_layer)
enc_output = beam_search_decoder.tile_beam_merge_with_batch(
self.token_feature, self.beam_size)
enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch(
enc_padding_mask, self.beam_size)
self.ret_infers, _ = dynamic_decode(
beam_search_decoder,
inits=dec_initial_states,
max_step_num=self.beam_max_step_num,
enc_output=enc_output,
enc_padding_mask=enc_padding_mask)
return self.ret_infers
def _postprocessing(self, run_states):
results = []
for batch_states in run_states:
batch_results = batch_states.run_results
batch_infers = batch_results[0].astype(np.int32)
seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
for i, sample_infers in enumerate(batch_infers):
beam_result = []
for beam_infer in sample_infers.T:
seq_result = [
self._label_list[infer]
for infer in beam_infer.tolist()[:seq_lens[i] - 2]
]
beam_result.append(seq_result)
results.append(beam_result)
return results
def _add_metrics(self):
self.ret_labels = fluid.layers.reshape(x=self.labels[0], shape=[-1, 1])
return [self.ret_labels, self.ret_infers, self.seq_len_used]
def _add_loss(self):
loss = fluid.layers.cross_entropy(
input=self.outputs[0], label=self.labels[0], soft_label=False)
loss = fluid.layers.unsqueeze(loss, axes=[2])
max_tar_seq_len = fluid.layers.shape(self.dec_input)[1]
tar_sequence_length = fluid.layers.elementwise_sub(
self.seq_len_used, fluid.layers.ones_like(self.seq_len_used))
tar_mask = fluid.layers.sequence_mask(
tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
loss = loss * tar_mask
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss
@property
def fetch_list(self):
if self.is_train_phase or self.is_test_phase:
return [metric.name for metric in self.metrics] + [self.loss.name]
elif self.is_predict_phase:
return [self.ret_infers.name] + [self.seq_len_used.name]
return [output.name for output in self.outputs]
def _calculate_metrics(self, run_states):
loss_sum = 0
run_step = run_examples = 0
labels = []
results = []
for run_state in run_states:
loss_sum += np.mean(run_state.run_results[-1])
np_labels = run_state.run_results[0]
np_infers = run_state.run_results[1]
np_lens = run_state.run_results[2]
batch_size = len(np_lens)
max_len = len(np_labels) // batch_size
for i in range(batch_size):
label = [
self.dataset.label_list[int(id)]
for id in np_labels[i * max_len:i * max_len + np_lens[i] -
2]
] # -2 for CLS and SEP
result = [
self.dataset.label_list[int(id)]
for id in np_infers[i * max_len:i * max_len + np_lens[i] -
2]
]
labels.append(label)
results.append(result)
run_examples += run_state.run_examples
run_step += run_state.run_step
run_time_used = time.time() - run_states[0].run_time_begin
run_speed = run_step / run_time_used
avg_loss = loss_sum / run_examples
# The first key will be used as main metrics to update the best model
scores = OrderedDict()
for metric in self.metrics_choices:
if metric == "bleu":
scores["bleu"] = compute_bleu(labels, results, max_order=1)[0]
else:
raise ValueError("Not Support Metric: \"%s\"" % metric)
return scores, avg_loss, run_speed
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册