From 386f2d13a0e2a78cd74a4d9005e9ea2a74dd1a0e Mon Sep 17 00:00:00 2001
From: guosheng
Date: Tue, 19 May 2020 19:36:11 +0800
Subject: [PATCH] Make Transformer, seq2seq, sequence_tagging adapt to
 paddle.incubate.hapi

---
 examples/seq2seq/seq2seq_attn.py              |   3 +-
 examples/seq2seq/seq2seq_base.py              |   3 +-
 examples/seq2seq/train.py                     |   5 +-
 examples/seq2seq/utility.py                   |  21 +-
 examples/sequence_tagging/eval.py             |  25 +-
 examples/sequence_tagging/predict.py          |  29 +-
 examples/sequence_tagging/reader.py           | 249 ++++++++++++
 examples/sequence_tagging/sequence_tagging.py | 196 ++++++++++
 examples/sequence_tagging/train.py            |  37 +-
 examples/sequence_tagging/utils/__init__.py   |   0
 examples/sequence_tagging/utils/check.py      |  58 +++
 examples/sequence_tagging/utils/configure.py  | 356 ++++++++++++++++++
 examples/sequence_tagging/utils/metrics.py    |  75 ++++
 examples/transformer/transformer.py           |  31 +-
 14 files changed, 996 insertions(+), 92 deletions(-)
 create mode 100644 examples/sequence_tagging/reader.py
 create mode 100644 examples/sequence_tagging/sequence_tagging.py
 create mode 100644 examples/sequence_tagging/utils/__init__.py
 create mode 100644 examples/sequence_tagging/utils/check.py
 create mode 100644 examples/sequence_tagging/utils/configure.py
 create mode 100644 examples/sequence_tagging/utils/metrics.py

diff --git a/examples/seq2seq/seq2seq_attn.py b/examples/seq2seq/seq2seq_attn.py
index 37c84cb..0979d68 100644
--- a/examples/seq2seq/seq2seq_attn.py
+++ b/examples/seq2seq/seq2seq_attn.py
@@ -19,7 +19,8 @@ from paddle.fluid.initializer import UniformInitializer
 from paddle.fluid.dygraph import Embedding, Linear, Layer
 from paddle.fluid.layers import BeamSearchDecoder
 
-from paddle.incubate.hapi.model import Model, Loss
+from paddle.incubate.hapi.model import Model
+from paddle.incubate.hapi.loss import Loss
 from paddle.incubate.hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
 
 from seq2seq_base import Encoder
diff --git a/examples/seq2seq/seq2seq_base.py b/examples/seq2seq/seq2seq_base.py
index 6342ca1..8d8b5b6 100644
--- a/examples/seq2seq/seq2seq_base.py
+++ b/examples/seq2seq/seq2seq_base.py
@@ -19,7 +19,8 @@ from paddle.fluid.initializer import UniformInitializer
 from paddle.fluid.dygraph import Embedding, Linear, Layer
 from paddle.fluid.layers import BeamSearchDecoder
 
-from paddle.incubate.hapi.model import Model, Loss
+from paddle.incubate.hapi.model import Model
+from paddle.incubate.hapi.loss import Loss
 from paddle.incubate.hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
 
 
diff --git a/examples/seq2seq/train.py b/examples/seq2seq/train.py
index cd4fab9..b065ff5 100644
--- a/examples/seq2seq/train.py
+++ b/examples/seq2seq/train.py
@@ -26,7 +26,7 @@ from args import parse_args
 from seq2seq_base import BaseModel, CrossEntropyCriterion
 from seq2seq_attn import AttentionModel
 from reader import create_data_loader
-from utility import PPL, TrainCallback
+from utility import PPL, TrainCallback, get_model_cls
 
 
 def do_train(args):
@@ -56,7 +56,8 @@ def do_train(args):
     # def dataloader
     train_loader, eval_loader = create_data_loader(args, device)
 
-    model_maker = AttentionModel if args.attention else BaseModel
+    model_maker = get_model_cls(
+        AttentionModel) if args.attention else get_model_cls(BaseModel)
     model = model_maker(args.src_vocab_size, args.tar_vocab_size,
                         args.hidden_size, args.hidden_size, args.num_layers,
                         args.dropout)
diff --git a/examples/seq2seq/utility.py b/examples/seq2seq/utility.py
index 95a38ff..b5ce128 100644
--- a/examples/seq2seq/utility.py
+++ b/examples/seq2seq/utility.py
@@ -18,6 +18,7 @@ import paddle.fluid as fluid
 
 from paddle.incubate.hapi.metrics import Metric
 from paddle.incubate.hapi.callbacks import ProgBarLogger
+from paddle.incubate.hapi.text import BasicLSTMCell
 
 
 class TrainCallback(ProgBarLogger):
@@ -77,4 +78,22 @@ class PPL(Metric):
     def cal_acc_ppl(self, batch_loss, batch_size):
         self.total_loss += batch_loss * batch_size
         ppl = math.exp(self.total_loss / self.word_count)
-        return ppl
\ No newline at end of file
+        return ppl
+
+
+def get_model_cls(model_cls):
+    """
+    Patch for BasicLSTMCell to make `_forget_bias.stop_gradient=False`.
+    Remove this workaround when BasicLSTMCell or recurrent_op is fixed.
+    """
+
+    def __lstm_patch__(self, *args, **kwargs):
+        self._raw_init(*args, **kwargs)
+        layers = self.sublayers(include_sublayers=True)
+        for layer in layers:
+            if isinstance(layer, BasicLSTMCell):
+                layer._forget_bias.stop_gradient = False
+
+    model_cls._raw_init = model_cls.__init__
+    model_cls.__init__ = __lstm_patch__
+    return model_cls
diff --git a/examples/sequence_tagging/eval.py b/examples/sequence_tagging/eval.py
index 2520d95..4163ce7 100644
--- a/examples/sequence_tagging/eval.py
+++ b/examples/sequence_tagging/eval.py
@@ -18,24 +18,14 @@ SequenceTagging eval structure
 from __future__ import division
 from __future__ import print_function
 
-import io
-import os
-import sys
-import math
-import argparse
-import numpy as np
-
-work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(os.path.join(work_dir, "../"))
-
-from paddle.incubate.hapi.model import set_device, Input
-from paddle.incubate.hapi.text.sequence_tagging import SeqTagging, ChunkEval, LacLoss
-from paddle.incubate.hapi.text.sequence_tagging import LacDataset, LacDataLoader
-from paddle.incubate.hapi.text.sequence_tagging import check_gpu, check_version
-from paddle.incubate.hapi.text.sequence_tagging import PDConfig
-
 import paddle.fluid as fluid
 from paddle.fluid.layers.utils import flatten
+from paddle.incubate.hapi.model import Input, set_device
+
+from sequence_tagging import SeqTagging, LacLoss, ChunkEval
+from reader import LacDataset, LacDataLoader
+from utils.check import check_gpu, check_version
+from utils.configure import PDConfig
 
 
 def main(args):
@@ -79,5 +69,6 @@ if __name__ == '__main__':
 
     use_gpu = True if args.device == "gpu" else False
     check_gpu(use_gpu)
-    check_version()
+    # TODO: add check for 2.0.0-alpha0 if fluid.require_version support
+    # check_version()
     main(args)
diff --git a/examples/sequence_tagging/predict.py b/examples/sequence_tagging/predict.py
index 5b1620d..583b41c 100644
--- a/examples/sequence_tagging/predict.py
+++ b/examples/sequence_tagging/predict.py
@@ -18,25 +18,16 @@ SequenceTagging predict structure
 from __future__ import division
 from __future__ import print_function
 
-import io
-import os
-import sys
 import six
-import math
-import argparse
-import numpy as np
-
-work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(os.path.join(work_dir, "../"))
-
-from paddle.incubate.hapi.text.sequence_tagging import SeqTagging
-from paddle.incubate.hapi.model import Input, set_device
-from paddle.incubate.hapi.text.sequence_tagging import LacDataset, LacDataLoader
-from paddle.incubate.hapi.text.sequence_tagging import check_gpu, check_version
-from paddle.incubate.hapi.text.sequence_tagging import PDConfig
 
 import paddle.fluid as fluid
 from paddle.fluid.layers.utils import flatten
+from paddle.incubate.hapi.model import Input, set_device
+
+from sequence_tagging import SeqTagging, LacLoss, ChunkEval
+from reader import LacDataset, LacDataLoader
+from utils.check import check_gpu, check_version
+from utils.configure import PDConfig
 
 
 def main(args):
@@ -45,8 +36,9 @@ def main(args):
 
     inputs = [
         Input(
-            [None, None], 'int64', name='words'), Input(
-                [None], 'int64', name='length')
+            [None, None], 'int64', name='words'),
+        Input(
+            [None], 'int64', name='length'),
     ]
 
     dataset = LacDataset(args)
@@ -87,5 +79,6 @@ if __name__ == '__main__':
 
     use_gpu = True if args.device == "gpu" else False
    check_gpu(use_gpu)
-    check_version()
+    # TODO: add check for 2.0.0-alpha0 if fluid.require_version support
+    # check_version()
     main(args)
diff --git a/examples/sequence_tagging/reader.py b/examples/sequence_tagging/reader.py
new file mode 100644
index 0000000..4831c7e
--- /dev/null
+++ b/examples/sequence_tagging/reader.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SequenceTagging dataset
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import numpy as np
+import shutil
+from functools import partial
+
+import paddle
+from paddle.io import BatchSampler, DataLoader, Dataset
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.incubate.hapi.distributed import DistributedBatchSampler
+
+
+class LacDataset(Dataset):
+    """
+    Load lexical analysis dataset
+    """
+
+    def __init__(self, args):
+        self.word_dict_path = args.word_dict_path
+        self.label_dict_path = args.label_dict_path
+        self.word_rep_dict_path = args.word_rep_dict_path
+        self._load_dict()
+        self.examples = []
+
+    def _load_dict(self):
+        self.word2id_dict = self.load_kv_dict(
+            self.word_dict_path, reverse=True, value_func=np.int64)
+        self.id2word_dict = self.load_kv_dict(self.word_dict_path)
+        self.label2id_dict = self.load_kv_dict(
+            self.label_dict_path, reverse=True, value_func=np.int64)
+        self.id2label_dict = self.load_kv_dict(self.label_dict_path)
+        if self.word_rep_dict_path is None:
+            self.word_replace_dict = dict()
+        else:
+            self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)
+
+    def load_kv_dict(self,
+                     dict_path,
+                     reverse=False,
+                     delimiter="\t",
+                     key_func=None,
+                     value_func=None):
+        """
+        Load key-value dict from file
+        """
+        result_dict = {}
+        for line in io.open(dict_path, "r", encoding='utf8'):
+            terms = line.strip("\n").split(delimiter)
+            if len(terms) != 2:
+                continue
+            if reverse:
+                value, key = terms
+            else:
+                key, value = terms
+            if key in result_dict:
+                raise KeyError("key duplicated with [%s]" % (key))
+            if key_func:
+                key = key_func(key)
+            if value_func:
+                value = value_func(value)
+            result_dict[key] = value
+        return result_dict
+
+    @property
+    def vocab_size(self):
+        return max(self.word2id_dict.values()) + 1
+
+    @property
+    def num_labels(self):
+        return max(self.label2id_dict.values()) + 1
+
+    def get_num_examples(self, filename):
+        """number of lines in the file"""
+        return sum(1 for line in io.open(filename, "r", encoding='utf8'))
+
+    def word_to_ids(self, words):
+        """convert word to word index"""
+        word_ids = []
+        for word in words:
+            word = self.word_replace_dict.get(word, word)
+            if word not in self.word2id_dict:
+                word = "OOV"
+            word_id = self.word2id_dict[word]
+            word_ids.append(word_id)
+
+        return word_ids
+
+    def label_to_ids(self, labels):
+        """convert label to label index"""
+        label_ids = []
+        for label in labels:
+            if label not in self.label2id_dict:
+                label = "O"
+            label_id = self.label2id_dict[label]
+            label_ids.append(label_id)
+        return label_ids
+
+    def file_reader(self, filename, phase="train"):
+        """
+        collect (word, target) examples from file into self.examples,
+        or words-only examples in `predict` mode
+        """
+        self.phase = phase
+        with io.open(filename, "r", encoding="utf8") as fr:
+            if phase in ["train", "test"]:
+                headline = next(fr)
+                headline = headline.strip().split('\t')
+                assert len(headline) == 2 and headline[
+                    0] == "text_a" and headline[1] == "label"
+
+                for line in fr:
+                    line_str = line.strip("\n")
+                    if len(line_str) < 1 or len(line_str.split('\t')) < 2:
+                        continue
+
+                    self.examples.append(line_str)
+            else:
+                for idx, line in enumerate(fr):
+                    words = line.strip("\n").split("\t")[0]
+                    self.examples.append(words)
+
+    def __getitem__(self, idx):
+        line_str = self.examples[idx]
+        if self.phase in ["train", "test"]:
+            words, labels = line_str.split('\t')
+            word_ids = self.word_to_ids(words.split("\002"))
+            label_ids = self.label_to_ids(labels.split("\002"))
+            assert len(word_ids) == len(label_ids)
+            return word_ids, label_ids
+        else:
+            words = [w for w in line_str]
+            word_ids = self.word_to_ids(words)
+            return word_ids
+
+    def __len__(self):
+
+        return len(self.examples)
+
+
+def create_lexnet_data_generator(args, insts, phase="train"):
+    def padding_data(max_len, batch_data, if_len=False):
+        padding_batch_data = []
+        padding_lens = []
+        for data in batch_data:
+            data = data[:max_len]
+            if if_len:
+                seq_len = np.int64(len(data))
+                padding_lens.append(seq_len)
+            data += [0 for _ in range(max_len - len(data))]
+            padding_batch_data.append(data)
+        if if_len:
+            return np.array(padding_batch_data), np.array(padding_lens)
+        else:
+            return np.array(padding_batch_data)
+
+    if phase == "train":
+        batch_words = [inst[0] for inst in insts]
+        batch_labels = [inst[1] for inst in insts]
+        padding_batch_words, padding_lens = padding_data(
+            args.max_seq_len, batch_words, if_len=True)
+        padding_batch_labels = padding_data(args.max_seq_len, batch_labels)
+        return [
+            padding_batch_words, padding_lens, padding_batch_labels,
+            padding_batch_labels
+        ]
+    elif phase == "test":
+        batch_words = [inst[0] for inst in insts]
+        seq_len = [len(inst[0]) for inst in insts]
+        max_seq_len = max(seq_len)
+        batch_labels = [inst[1] for inst in insts]
+        padding_batch_words, padding_lens = padding_data(
+            max_seq_len, batch_words, if_len=True)
+        padding_batch_labels = padding_data(max_seq_len, batch_labels)
+        return [
+            padding_batch_words, padding_lens, padding_batch_labels,
+            padding_batch_labels
+        ]
+    else:
+        batch_words = insts
+        seq_len = [len(inst) for inst in insts]
+        max_seq_len = max(seq_len)
+        padding_batch_words, padding_lens = padding_data(
+            max_seq_len, batch_words, if_len=True)
+        return [padding_batch_words, padding_lens]
+
+
+class LacDataLoader(object):
+    def __init__(self,
+                 args,
+                 place,
+                 phase="train",
+                 shuffle=False,
+                 num_workers=0,
+                 drop_last=False):
+        assert phase in [
+            "train", "test", "predict"
+        ], "phase should be in [train, test, predict], but got %s" % phase
+
phase == "train": + file_name = args.train_file + elif phase == "test": + file_name = args.test_file + elif phase == "predict": + file_name = args.predict_file + + self.dataset = LacDataset(args) + self.dataset.file_reader(file_name, phase=phase) + + if phase == "train": + self.sampler = DistributedBatchSampler( + dataset=self.dataset, + batch_size=args.batch_size, + shuffle=shuffle, + drop_last=drop_last) + else: + self.sampler = BatchSampler( + dataset=self.dataset, + batch_size=args.batch_size, + shuffle=shuffle, + drop_last=drop_last) + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_sampler=self.sampler, + places=place, + collate_fn=partial( + create_lexnet_data_generator, args, phase=phase), + num_workers=num_workers, + return_list=True) diff --git a/examples/sequence_tagging/sequence_tagging.py b/examples/sequence_tagging/sequence_tagging.py new file mode 100644 index 0000000..3392a61 --- /dev/null +++ b/examples/sequence_tagging/sequence_tagging.py @@ -0,0 +1,196 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +SequenceTagging network structure +""" + +from __future__ import division +from __future__ import print_function + +import io +import os +import sys +import math +import argparse +import numpy as np + +import paddle.fluid as fluid +from paddle.incubate.hapi.metrics import Metric +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.loss import Loss +from paddle.incubate.hapi.text import SequenceTagging + +from utils.check import check_gpu, check_version +from utils.configure import PDConfig + + +class SeqTagging(Model): + def __init__(self, args, vocab_size, num_labels, length=None, + mode="train"): + super(SeqTagging, self).__init__() + """ + define the lexical analysis network structure + word: stores the input of the model + for_infer: a boolean value, indicating if the model to be created is for training or predicting. 
+
+        return:
+            for "predict": the prediction
+            otherwise: the prediction and the loss
+        """
+        self.mode_type = mode
+        self.word_emb_dim = args.word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = args.grnn_hidden_dim
+        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
+            args) else 1.0
+        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
+            args) else 1.0
+        self.bigru_num = args.bigru_num
+        self.batch_size = args.batch_size
+        self.init_bound = 0.1
+        self.length = length
+
+        self.sequence_tagging = SequenceTagging(
+            vocab_size=self.vocab_size,
+            num_labels=self.num_labels,
+            word_emb_dim=self.word_emb_dim,
+            grnn_hidden_dim=self.grnn_hidden_dim,
+            emb_learning_rate=self.emb_lr,
+            crf_learning_rate=self.crf_lr,
+            bigru_num=self.bigru_num,
+            init_bound=self.init_bound)
+
+    def forward(self, *inputs):
+        """
+        Configure the network
+        """
+        word = inputs[0]
+        lengths = inputs[1]
+        if self.mode_type == "train" or self.mode_type == "test":
+            target = inputs[2]
+            outputs = self.sequence_tagging(word, lengths, target)
+        else:
+            outputs = self.sequence_tagging(word, lengths)
+        return outputs
+
+
+class Chunk_eval(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_chunk_types,
+                 chunk_scheme,
+                 excluded_chunk_types=None):
+        super(Chunk_eval, self).__init__()
+        self.num_chunk_types = num_chunk_types
+        self.chunk_scheme = chunk_scheme
+        self.excluded_chunk_types = excluded_chunk_types
+
+    def forward(self, input, label, seq_length=None):
+        precision = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        recall = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        f1_score = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        num_infer_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_label_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_correct_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        this_input = {"Inference": input, "Label": label}
+        if seq_length is not None:
+            this_input["SeqLength"] = seq_length
+        self._helper.append_op(
+            type='chunk_eval',
+            inputs=this_input,
+            outputs={
+                "Precision": [precision],
+                "Recall": [recall],
+                "F1-Score": [f1_score],
+                "NumInferChunks": [num_infer_chunks],
+                "NumLabelChunks": [num_label_chunks],
+                "NumCorrectChunks": [num_correct_chunks]
+            },
+            attrs={
+                "num_chunk_types": self.num_chunk_types,
+                "chunk_scheme": self.chunk_scheme,
+                "excluded_chunk_types": self.excluded_chunk_types or []
+            })
+        return (num_infer_chunks, num_label_chunks, num_correct_chunks)
+
+
+class LacLoss(Loss):
+    def __init__(self):
+        super(LacLoss, self).__init__()
+        pass
+
+    def forward(self, outputs, labels):
+        avg_cost = outputs[1]
+        return avg_cost
+
+
+class ChunkEval(Metric):
+    def __init__(self, num_labels, name=None, *args, **kwargs):
+        super(ChunkEval, self).__init__(*args, **kwargs)
+        self._init_name(name)
+        self.chunk_eval = Chunk_eval(
+            int(math.ceil((num_labels - 1) / 2.0)), "IOB")
+        self.reset()
+
+    def add_metric_op(self, *args):
+        crf_decode = args[0]
+        lengths = args[2]
+        label = args[3]
+        (num_infer_chunks, num_label_chunks,
+         num_correct_chunks) = self.chunk_eval(
+             input=crf_decode, label=label, seq_length=lengths)
+        return [num_infer_chunks, num_label_chunks, num_correct_chunks]
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
+               *args, **kwargs):
+        self.infer_chunks_total += num_infer_chunks
+        self.label_chunks_total += num_label_chunks
+        self.correct_chunks_total += num_correct_chunks
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return [precision, recall, f1_score]
+
+    def reset(self):
+        self.infer_chunks_total = 0
+        self.label_chunks_total = 0
+        self.correct_chunks_total = 0
+
+    def accumulate(self):
+        precision = float(
+            self.correct_chunks_total
+        ) / self.infer_chunks_total if self.infer_chunks_total else 0
+        recall = float(
+            self.correct_chunks_total
+        ) / self.label_chunks_total if self.label_chunks_total else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.correct_chunks_total else 0
+        res = [precision, recall, f1_score]
+        return res
+
+    def _init_name(self, name):
+        name = name or 'chunk eval'
+        self._name = ['precision', 'recall', 'F1']
+
+    def name(self):
+        return self._name
diff --git a/examples/sequence_tagging/train.py b/examples/sequence_tagging/train.py
index 69b76b1..5626838 100644
--- a/examples/sequence_tagging/train.py
+++ b/examples/sequence_tagging/train.py
@@ -18,24 +18,14 @@ SequenceTagging network structure
 from __future__ import division
 from __future__ import print_function
 
-import io
-import os
-import sys
-import math
-import argparse
-import numpy as np
-
-work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(os.path.join(work_dir, "../"))
-
-from paddle.incubate.hapi.model import Input, set_device
-from paddle.incubate.hapi.text.sequence_tagging import SeqTagging, LacLoss, ChunkEval
-from paddle.incubate.hapi.text.sequence_tagging import LacDataset, LacDataLoader
-from paddle.incubate.hapi.text.sequence_tagging import check_gpu, check_version
-from paddle.incubate.hapi.text.sequence_tagging import PDConfig
-
 import paddle.fluid as fluid
 from paddle.fluid.optimizer import AdamOptimizer
+from paddle.incubate.hapi.model import Input, set_device
+
+from sequence_tagging import SeqTagging, LacLoss, ChunkEval
+from reader import LacDataset, LacDataLoader
+from utils.check import check_gpu, check_version
+from utils.configure import PDConfig
 
 
 def main(args):
@@ -44,17 +34,15 @@ def main(args):
 
     inputs = [
         Input(
-            [None, None], 'int64', name='words'), Input(
-                [None], 'int64', name='length'), Input(
-                    [None, None], 'int64', name='target')
+            [None, None], 'int64', name='words'),
+        Input(
+            [None], 'int64', name='length'),
+        Input(
+            [None, None], 'int64', name='target'),
     ]
 
     labels = [Input([None, None], 'int64', name='labels')]
 
-    feed_list = None if args.dynamic else [
-        x.forward() for x in inputs + labels
-    ]
-
     dataset = LacDataset(args)
 
     train_dataset = LacDataLoader(args, place, phase="train")
@@ -95,6 +83,7 @@ if __name__ == '__main__':
 
     use_gpu = True if args.device == "gpu" else False
     check_gpu(use_gpu)
-    check_version()
+    # TODO: add check for 2.0.0-alpha0 if fluid.require_version support
+    # check_version()
     main(args)
 
diff --git a/examples/sequence_tagging/utils/__init__.py b/examples/sequence_tagging/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/sequence_tagging/utils/check.py b/examples/sequence_tagging/utils/check.py
new file mode 100644
index 0000000..ade4fc2
--- /dev/null
+++ b/examples/sequence_tagging/utils/check.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import paddle.fluid as fluid
+
+__all__ = ['check_gpu', 'check_version']
+
+
+def check_gpu(use_gpu):
+    """
+    Log an error and exit when use_gpu is set to True
+    under a CPU-only version of PaddlePaddle.
+    """
+    err = "Config use_gpu cannot be set as true while you are " \
+          "using paddlepaddle cpu version! \nPlease try: \n" \
+          "\t1. Install paddlepaddle-gpu to run model on GPU \n" \
+          "\t2. Set use_gpu as false in config file to run " \
+          "model on CPU"
+
+    try:
+        if use_gpu and not fluid.is_compiled_with_cuda():
+            print(err)
+            sys.exit(1)
+    except Exception as e:
+        pass
+
+
+def check_version():
+    """
+    Log an error and exit when the installed version of
+    PaddlePaddle does not satisfy the requirement.
+    """
+    err = "PaddlePaddle version 2.0.0 or higher is required, " \
+          "or a suitable develop version. \n" \
+          "Please make sure the installed version matches your code."
+
+    try:
+        fluid.require_version('2.0.0')
+    except Exception as e:
+        print(err)
+        sys.exit(1)
diff --git a/examples/sequence_tagging/utils/configure.py b/examples/sequence_tagging/utils/configure.py
new file mode 100644
index 0000000..17dfaa5
--- /dev/null
+++ b/examples/sequence_tagging/utils/configure.py
@@ -0,0 +1,356 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import argparse
+import json
+import yaml
+import six
+import logging
+
+logging_only_message = "%(message)s"
+logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
+
+
+class JsonConfig(object):
+    """
+    A high-level API for handling JSON config files.
+ """ + + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path) as json_file: + config_dict = json.load(json_file) + except: + raise IOError("Error in parsing bert model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def print_config(self): + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +class ArgumentGroup(object): + def __init__(self, parser, title, des): + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, name, type, default, help, **kwargs): + type = str2bool if type == bool else type + self._group.add_argument( + "--" + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +class ArgConfig(object): + """ + A high-level api for handling argument configs. + """ + + def __init__(self): + parser = argparse.ArgumentParser() + + train_g = ArgumentGroup(parser, "training", "training options.") + train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") + train_g.add_arg("learning_rate", float, 5e-5, + "Learning rate used to train with warmup.") + train_g.add_arg( + "lr_scheduler", + str, + "linear_warmup_decay", + "scheduler of learning rate.", + choices=['linear_warmup_decay', 'noam_decay']) + train_g.add_arg("weight_decay", float, 0.01, + "Weight decay rate for L2 regularizer.") + train_g.add_arg( + "warmup_proportion", float, 0.1, + "Proportion of training steps to perform linear learning rate warmup for." + ) + train_g.add_arg("save_steps", int, 1000, + "The steps interval to save checkpoints.") + train_g.add_arg("use_fp16", bool, False, + "Whether to use fp16 mixed precision training.") + train_g.add_arg( + "loss_scaling", float, 1.0, + "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." 
+        )
+        train_g.add_arg("pred_dir", str, None,
+                        "Path to save the prediction results.")
+
+        log_g = ArgumentGroup(parser, "logging", "logging related.")
+        log_g.add_arg("skip_steps", int, 10,
+                      "The steps interval to print loss.")
+        log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
+
+        run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+        run_type_g.add_arg("use_cuda", bool, True,
+                           "If set, use GPU for training.")
+        run_type_g.add_arg(
+            "use_fast_executor", bool, False,
+            "If set, use fast parallel executor (in experiment).")
+        run_type_g.add_arg(
+            "num_iteration_per_drop_scope", int, 1,
+            "The iteration intervals to clean up temporary variables.")
+        run_type_g.add_arg("do_train", bool, True,
+                           "Whether to perform training.")
+        run_type_g.add_arg("do_predict", bool, True,
+                           "Whether to perform prediction.")
+
+        custom_g = ArgumentGroup(parser, "customize", "customized options.")
+
+        self.custom_g = custom_g
+
+        self.parser = parser
+
+    def add_arg(self, name, dtype, default, descrip):
+        self.custom_g.add_arg(name, dtype, default, descrip)
+
+    def build_conf(self):
+        return self.parser.parse_args()
+
+
+def str2bool(v):
+    # because argparse does not support parsing "True"/"False" strings as
+    # Python booleans directly
+    return v.lower() in ("true", "t", "1")
+
+
+def print_arguments(args, log=None):
+    if not log:
+        print('----------- Configuration Arguments -----------')
+        for arg, value in sorted(six.iteritems(vars(args))):
+            print('%s: %s' % (arg, value))
+        print('------------------------------------------------')
+    else:
+        log.info('----------- Configuration Arguments -----------')
+        for arg, value in sorted(six.iteritems(vars(args))):
+            log.info('%s: %s' % (arg, value))
+        log.info('------------------------------------------------')
+
+
+class PDConfig(object):
+    """
+    A high-level API for managing configuration files in PaddlePaddle.
+    Can jointly work with command-line arguments, JSON files and YAML files.
+    """
+
+    def __init__(self, json_file="", yaml_file="", fuse_args=True):
+        """
+        Init function for PDConfig.
+        json_file: the path to the JSON configure file.
+        yaml_file: the path to the YAML configure file.
+        fuse_args: if fuse the JSON/YAML configs with argparse.
+        """
+        assert isinstance(json_file, str)
+        assert isinstance(yaml_file, str)
+
+        if json_file != "" and yaml_file != "":
+            raise Warning(
+                "json_file and yaml_file cannot co-exist for now. Please only use one configure file type."
+            )
+            return
+
+        self.args = None
+        self.arg_config = {}
+        self.json_config = {}
+        self.yaml_config = {}
+
+        parser = argparse.ArgumentParser()
+
+        self.default_g = ArgumentGroup(parser, "default", "default options.")
+        self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
+        self.json_g = ArgumentGroup(parser, "json", "options from json.")
+        self.com_g = ArgumentGroup(parser, "custom", "customized options.")
+
+        self.default_g.add_arg("do_train", bool, False,
+                               "Whether to perform training.")
+        self.default_g.add_arg("do_predict", bool, False,
+                               "Whether to perform predicting.")
+        self.default_g.add_arg("do_eval", bool, False,
+                               "Whether to perform evaluating.")
+        self.default_g.add_arg(
+            "do_save_inference_model", bool, False,
+            "Whether to perform model saving for inference.")
+
+        # NOTE: args for profiler
+        self.default_g.add_arg(
+            "is_profiler", int, 0,
+            "the switch of profiler tools. (used for benchmark)")
+        self.default_g.add_arg(
+            "profiler_path", str, './',
+            "the profiler output file path. (used for benchmark)")
+        self.default_g.add_arg("max_iter", int, 0,
+                               "the max train batch num. (used for benchmark)")
+
+        self.parser = parser
+
+        if json_file != "":
+            self.load_json(json_file, fuse_args=fuse_args)
+
+        if yaml_file:
+            self.load_yaml(yaml_file, fuse_args=fuse_args)
+
+    def load_json(self, file_path, fuse_args=True):
+
+        if not os.path.exists(file_path):
+            raise Warning("the json file %s does not exist." % file_path)
+            return
+
+        with open(file_path, "r") as fin:
+            self.json_config = json.loads(fin.read())
+            fin.close()
+
+        if fuse_args:
+            for name in self.json_config:
+                if isinstance(self.json_config[name], list):
+                    self.json_g.add_arg(
+                        name,
+                        type(self.json_config[name][0]),
+                        self.json_config[name],
+                        "This is from %s" % file_path,
+                        nargs=len(self.json_config[name]))
+                    continue
+                if not isinstance(self.json_config[name], int) \
+                    and not isinstance(self.json_config[name], float) \
+                    and not isinstance(self.json_config[name], str) \
+                    and not isinstance(self.json_config[name], bool):
+
+                    continue
+
+                self.json_g.add_arg(name,
+                                    type(self.json_config[name]),
+                                    self.json_config[name],
+                                    "This is from %s" % file_path)
+
+    def load_yaml(self, file_path, fuse_args=True):
+
+        if not os.path.exists(file_path):
+            raise Warning("the yaml file %s does not exist." % file_path)
+            return
+
+        with open(file_path, "r") as fin:
+            self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
+            fin.close()
+
+        if fuse_args:
+            for name in self.yaml_config:
+                if isinstance(self.yaml_config[name], list):
+                    self.yaml_g.add_arg(
+                        name,
+                        type(self.yaml_config[name][0]),
+                        self.yaml_config[name],
+                        "This is from %s" % file_path,
+                        nargs=len(self.yaml_config[name]))
+                    continue
+
+                if not isinstance(self.yaml_config[name], int) \
+                    and not isinstance(self.yaml_config[name], float) \
+                    and not isinstance(self.yaml_config[name], str) \
+                    and not isinstance(self.yaml_config[name], bool):
+
+                    continue
+
+                self.yaml_g.add_arg(name,
+                                    type(self.yaml_config[name]),
+                                    self.yaml_config[name],
+                                    "This is from %s" % file_path)
+
+    def build(self):
+        self.args = self.parser.parse_args()
+        self.arg_config = vars(self.args)
+
+    def __add__(self, new_arg):
+        assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
+        assert len(new_arg) >= 3
+        assert self.args is None
+
+        name = new_arg[0]
+        dtype = new_arg[1]
+        dvalue = new_arg[2]
+        desc = new_arg[3] if len(
+            new_arg) == 4 else "Description is not provided."
+
+        self.com_g.add_arg(name, dtype, dvalue, desc)
+
+        return self
+
+    def __getattr__(self, name):
+        if name in self.arg_config:
+            return self.arg_config[name]
+
+        if name in self.json_config:
+            return self.json_config[name]
+
+        if name in self.yaml_config:
+            return self.yaml_config[name]
+
+        raise Warning("The argument %s is not defined." % name)
+
+    def Print(self):
+
+        print("-" * 70)
+        for name in self.arg_config:
+            print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
+
+        for name in self.json_config:
+            if name not in self.arg_config:
+                print("%s:\t\t\t\t%s" %
+                      (str(name), str(self.json_config[name])))
+
+        for name in self.yaml_config:
+            if name not in self.arg_config:
+                print("%s:\t\t\t\t%s" %
+                      (str(name), str(self.yaml_config[name])))
+
+        print("-" * 70)
+
+
+if __name__ == "__main__":
+    """
+    pd_config = PDConfig(json_file = "./test/bert_config.json")
+    pd_config.build()
+
+    print(pd_config.do_train)
+    print(pd_config.hidden_size)
+
+    pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
+    pd_config.build()
+
+    print(pd_config.do_train)
+    print(pd_config.hidden_size)
+    """
+
+    pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
+    pd_config += ("my_age", int, 18, "I am forever 18.")
+    pd_config.build()
+
+    print(pd_config.do_train)
+    print(pd_config.hidden_size)
+    print(pd_config.my_age)
diff --git a/examples/sequence_tagging/utils/metrics.py b/examples/sequence_tagging/utils/metrics.py
new file mode 100644
index 0000000..2b64223
--- /dev/null
+++ b/examples/sequence_tagging/utils/metrics.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import paddle.fluid as fluid
+
+__all__ = ['chunk_count', "build_chunk"]
+
+
+def build_chunk(data_list, id2label_dict):
+    """
+    Assemble entity chunks from a tag id sequence
+    """
+    tag_list = [id2label_dict.get(str(id)) for id in data_list]
+    ner_dict = {}
+    ner_str = ""
+    ner_start = 0
+    for i in range(len(tag_list)):
+        tag = tag_list[i]
+        if tag == u"O":
+            if i != 0:
+                key = "%d_%d" % (ner_start, i - 1)
+                ner_dict[key] = ner_str
+            ner_start = i
+            ner_str = tag
+        elif tag.endswith(u"B"):
+            if i != 0:
+                key = "%d_%d" % (ner_start, i - 1)
+                ner_dict[key] = ner_str
+            ner_start = i
+            ner_str = tag.split('-')[0]
+        elif tag.endswith(u"I"):
+            if tag.split('-')[0] != ner_str:
+                if i != 0:
+                    key = "%d_%d" % (ner_start, i - 1)
+                    ner_dict[key] = ner_str
+                ner_start = i
+                ner_str = tag.split('-')[0]
+    return ner_dict
+
+
+def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
+    """
+    calculate num_infer_chunks, num_label_chunks and num_correct_chunks
+    for the chunk evaluation metrics
+    """
+    num_infer_chunks, num_label_chunks, num_correct_chunks = 0, 0, 0
+    assert infer_numpy.shape[0] == label_numpy.shape[0]
+
+    for i in range(infer_numpy.shape[0]):
+        infer_list = infer_numpy[i][:seq_len[i]]
+        label_list = label_numpy[i][:seq_len[i]]
+        infer_dict = build_chunk(infer_list, id2label_dict)
+        num_infer_chunks += len(infer_dict)
+        label_dict = build_chunk(label_list, id2label_dict)
+        num_label_chunks += len(label_dict)
+        for key in infer_dict:
+            if key in label_dict and label_dict[key] == infer_dict[key]:
+                num_correct_chunks += 1
+    return num_infer_chunks, num_label_chunks, num_correct_chunks
diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py
index 02405d0..d7c389c 100644
--- a/examples/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -18,9 +18,9 @@ import numpy as np
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
-from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
-from paddle.incubate.hapi.model import Model, CrossEntropy, Loss
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
+from paddle.incubate.hapi.model import Model
+from paddle.incubate.hapi.loss import Loss
 from paddle.incubate.hapi.text import TransformerBeamSearchDecoder, DynamicDecode
 
 
@@ -43,31 +43,6 @@ def position_encoding_init(n_position, d_pos_vec):
     return position_enc.astype("float32")
 
 
-class NoamDecay(LearningRateDecay):
-    """
-    learning rate scheduler
-    """
-
-    def __init__(self,
-                 d_model,
-                 warmup_steps,
-                 static_lr=2.0,
-                 begin=1,
-                 step=1,
-                 dtype='float32'):
-        super(NoamDecay, self).__init__(begin, step, dtype)
-        self.d_model = d_model
-        self.warmup_steps = warmup_steps
-        self.static_lr = static_lr
-
-    def step(self):
-        a = self.create_lr_var(self.step_num**-0.5)
-        b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = (self.d_model**-0.5) * layers.elementwise_min(
-            a, b) * self.static_lr
-        return lr_value
-
-
 class PrePostProcessLayer(Layer):
     """
     PrePostProcessLayer
-- 
GitLab
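
Appendix (not part of the patch): a minimal usage sketch of how the relocated APIs fit together after this change, mirroring the import block the patch puts into examples/sequence_tagging/train.py. The Model.prepare/Model.fit calls and the config fields referenced below (base_learning_rate, epoch, device, dynamic, and the sequence_tagging.yaml file name) are assumptions based on the paddle.incubate.hapi API of this era and the example's YAML config; they are not code confirmed by the hunks above.

import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.incubate.hapi.model import Input, set_device

from sequence_tagging import SeqTagging, LacLoss, ChunkEval
from reader import LacDataset, LacDataLoader
from utils.configure import PDConfig

# Hypothetical driver; assumes sequence_tagging.yaml defines the fields used below.
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()

place = set_device(args.device)
if args.dynamic:
    fluid.enable_dygraph(place)

# Static input descriptions, matching the ones built in train.py's main().
inputs = [
    Input([None, None], 'int64', name='words'),
    Input([None], 'int64', name='length'),
    Input([None, None], 'int64', name='target'),
]
labels = [Input([None, None], 'int64', name='labels')]

dataset = LacDataset(args)
train_data = LacDataLoader(args, place, phase="train")

model = SeqTagging(args, dataset.vocab_size, dataset.num_labels, mode="train")
optim = AdamOptimizer(
    learning_rate=args.base_learning_rate,
    parameter_list=model.parameters())
# prepare() wires optimizer, loss and metric; fit() consumes the DataLoader
# built by LacDataLoader above.
model.prepare(
    optim,
    LacLoss(),
    ChunkEval(dataset.num_labels),
    inputs=inputs,
    labels=labels,
    device=args.device)
model.fit(train_data.dataloader, epochs=args.epoch)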