diff --git a/examples/sequence_tagging/README.md b/examples/sequence_tagging/README.md index b36e9cda77efe701dcc1e342e65f50c70fd69c8d..387c0a36f18f6e598b94f4c9ae558b535d7e67d9 100644 --- a/examples/sequence_tagging/README.md +++ b/examples/sequence_tagging/README.md @@ -186,14 +186,12 @@ Overall Architecture of GRU-CRF-MODEL ├── data/ # 存放数据集的目录 ├── conf/ # 词典及程序默认配置的目录 ├── images/ # 文档图片存放位置 -├── utils/ # 常用工具函数 ├── train.py # 训练脚本 ├── predict.py # 预测脚本 ├── eval.py # 词法分析评估的脚本 ├── downloads.py # 用于下载数据和模型的脚本 ├── downloads.sh # 用于下载数据和模型的脚本 -├── sequence_tagging.yaml # 模型训练、预测、评估相关配置参数 -└──reader.py # 文件读取相关函数 +└── sequence_tagging.yaml # 模型训练、预测、评估相关配置参数 ``` diff --git a/examples/sequence_tagging/eval.py b/examples/sequence_tagging/eval.py index b1e617bdc45dbad1a5f966a399c2168dbf02bb54..f58337be83b49978494bb1e9a9634ecb9256b909 100644 --- a/examples/sequence_tagging/eval.py +++ b/examples/sequence_tagging/eval.py @@ -25,14 +25,14 @@ import math import argparse import numpy as np -from train import SeqTagging, ChunkEval, LacLoss -from utils.configure import PDConfig -from utils.check import check_gpu, check_version -from reader import LacDataset, LacDataLoader - work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.join(work_dir, "../")) + from hapi.model import set_device, Input +from hapi.text.sequence_tagging import SeqTagging, ChunkEval, LacLoss +from hapi.text.sequence_tagging import LacDataset, LacDataLoader +from hapi.text.sequence_tagging import check_gpu, check_version +from hapi.text.sequence_tagging import PDConfig import paddle.fluid as fluid from paddle.fluid.layers.utils import flatten @@ -65,7 +65,10 @@ def main(args): device=place) model.load(args.init_from_checkpoint, skip_mismatch=True) - model.evaluate(eval_dataset.dataloader, batch_size=args.batch_size) + eval_result = model.evaluate(eval_dataset.dataloader, batch_size=args.batch_size) + print("precison: %.5f" % (eval_result["precision"][0])) + 
print("recall: %.5f" % (eval_result["recall"][0])) + print("F1: %.5f" % (eval_result["F1"][0])) if __name__ == '__main__': diff --git a/examples/sequence_tagging/predict.py b/examples/sequence_tagging/predict.py index 5067eb7c844972dd2a625901e841196b527c6e8a..e8802e4a91ee4fbd339be29a83914cf24b9839c1 100644 --- a/examples/sequence_tagging/predict.py +++ b/examples/sequence_tagging/predict.py @@ -26,14 +26,14 @@ import math import argparse import numpy as np -from train import SeqTagging -from utils.check import check_gpu, check_version -from utils.configure import PDConfig -from reader import LacDataset, LacDataLoader - work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.join(work_dir, "../")) -from hapi.model import set_device, Input + +from hapi.text.sequence_tagging import SeqTagging +from hapi.model import Input, set_device +from hapi.text.sequence_tagging import LacDataset, LacDataLoader +from hapi.text.sequence_tagging import check_gpu, check_version +from hapi.text.sequence_tagging import PDConfig import paddle.fluid as fluid from paddle.fluid.layers.utils import flatten diff --git a/examples/sequence_tagging/train.py b/examples/sequence_tagging/train.py index 41422fc7d722a2ebea606080151da6807156ad18..56507ad05b96baab8d7d262da3b7655e82df6f23 100644 --- a/examples/sequence_tagging/train.py +++ b/examples/sequence_tagging/train.py @@ -28,183 +28,15 @@ import numpy as np work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.join(work_dir, "../")) -from hapi.metrics import Metric -from hapi.model import Model, Input, set_device -from hapi.loss import Loss -from hapi.text.text import SequenceTagging - -from utils.check import check_gpu, check_version -from utils.configure import PDConfig -from reader import LacDataset, LacDataLoader +from hapi.model import Input, set_device +from hapi.text.sequence_tagging import SeqTagging, LacLoss, ChunkEval +from hapi.text.sequence_tagging 
import LacDataset, LacDataLoader +from hapi.text.sequence_tagging import check_gpu, check_version +from hapi.text.sequence_tagging import PDConfig import paddle.fluid as fluid from paddle.fluid.optimizer import AdamOptimizer -__all__ = ["SeqTagging", "LacLoss", "ChunkEval"] - - -class SeqTagging(Model): - def __init__(self, args, vocab_size, num_labels, length=None, - mode="train"): - super(SeqTagging, self).__init__() - """ - define the lexical analysis network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. - - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.mode_type = mode - self.word_emb_dim = args.word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = args.grnn_hidden_dim - self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir( - args) else 1.0 - self.crf_lr = args.emb_learning_rate if 'crf_learning_rate' in dir( - args) else 1.0 - self.bigru_num = args.bigru_num - self.batch_size = args.batch_size - self.init_bound = 0.1 - self.length = length - - self.sequence_tagging = SequenceTagging( - vocab_size=self.vocab_size, - num_labels=self.num_labels, - batch_size=self.batch_size, - word_emb_dim=self.word_emb_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - emb_learning_rate=self.emb_lr, - crf_learning_rate=self.crf_lr, - bigru_num=self.bigru_num, - init_bound=self.init_bound, - length=self.length) - - def forward(self, *inputs): - """ - Configure the network - """ - word = inputs[0] - lengths = inputs[1] - if self.mode_type == "train" or self.mode_type == "test": - target = inputs[2] - outputs = self.sequence_tagging(word, lengths, target) - else: - outputs = self.sequence_tagging(word, lengths) - return outputs - - -class Chunk_eval(fluid.dygraph.Layer): - def __init__(self, - num_chunk_types, - chunk_scheme, - excluded_chunk_types=None): - super(Chunk_eval, 
self).__init__() - self.num_chunk_types = num_chunk_types - self.chunk_scheme = chunk_scheme - self.excluded_chunk_types = excluded_chunk_types - - def forward(self, input, label, seq_length=None): - precision = self._helper.create_variable_for_type_inference( - dtype="float32") - recall = self._helper.create_variable_for_type_inference( - dtype="float32") - f1_score = self._helper.create_variable_for_type_inference( - dtype="float32") - num_infer_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_label_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_correct_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - this_input = {"Inference": input, "Label": label} - if seq_length is not None: - this_input["SeqLength"] = seq_length - self._helper.append_op( - type='chunk_eval', - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks] - }, - attrs={ - "num_chunk_types": self.num_chunk_types, - "chunk_scheme": self.chunk_scheme, - "excluded_chunk_types": self.excluded_chunk_types or [] - }) - return (num_infer_chunks, num_label_chunks, num_correct_chunks) - - -class LacLoss(Loss): - def __init__(self): - super(LacLoss, self).__init__() - pass - - def forward(self, outputs, labels): - avg_cost = outputs[1] - return avg_cost - - -class ChunkEval(Metric): - def __init__(self, num_labels, name=None, *args, **kwargs): - super(ChunkEval, self).__init__(*args, **kwargs) - self._init_name(name) - self.chunk_eval = Chunk_eval( - int(math.ceil((num_labels - 1) / 2.0)), "IOB") - self.reset() - - def add_metric_op(self, *args): - crf_decode = args[0] - lengths = args[2] - label = args[3] - (num_infer_chunks, num_label_chunks, - num_correct_chunks) = self.chunk_eval( - input=crf_decode, label=label, seq_length=lengths) - return 
[num_infer_chunks, num_label_chunks, num_correct_chunks] - - def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks, - *args, **kwargs): - self.infer_chunks_total += num_infer_chunks - self.label_chunks_total += num_label_chunks - self.correct_chunks_total += num_correct_chunks - precision = float( - num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0 - recall = float( - num_correct_chunks) / num_label_chunks if num_label_chunks else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if num_correct_chunks else 0 - return [precision, recall, f1_score] - - def reset(self): - self.infer_chunks_total = 0 - self.label_chunks_total = 0 - self.correct_chunks_total = 0 - - def accumulate(self): - precision = float( - self.correct_chunks_total - ) / self.infer_chunks_total if self.infer_chunks_total else 0 - recall = float( - self.correct_chunks_total - ) / self.label_chunks_total if self.label_chunks_total else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if self.correct_chunks_total else 0 - res = [precision, recall, f1_score] - return res - - def _init_name(self, name): - name = name or 'chunk eval' - self._name = ['precision', 'recall', 'F1'] - - def name(self): - return self._name - def main(args): place = set_device(args.device) diff --git a/hapi/text/sequence_tagging/__init__.py b/hapi/text/sequence_tagging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18d16eeefe522f30afc8afba5c6cf3be86db65ae --- /dev/null +++ b/hapi/text/sequence_tagging/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hapi.text.sequence_tagging.reader import LacDataset as LacDataset
+from hapi.text.sequence_tagging.reader import LacDataLoader as LacDataLoader
+from hapi.text.sequence_tagging.sequence_tagging import SeqTagging as SeqTagging
+from hapi.text.sequence_tagging.sequence_tagging import Chunk_eval as Chunk_eval
+from hapi.text.sequence_tagging.sequence_tagging import LacLoss as LacLoss
+from hapi.text.sequence_tagging.sequence_tagging import ChunkEval as ChunkEval
+from hapi.text.sequence_tagging.utils.configure import PDConfig as PDConfig
+from hapi.text.sequence_tagging.utils.check import check_gpu as check_gpu
+from hapi.text.sequence_tagging.utils.check import check_version as check_version
+
diff --git a/examples/sequence_tagging/reader.py b/hapi/text/sequence_tagging/reader.py
similarity index 100%
rename from examples/sequence_tagging/reader.py
rename to hapi/text/sequence_tagging/reader.py
diff --git a/hapi/text/sequence_tagging/sequence_tagging.py b/hapi/text/sequence_tagging/sequence_tagging.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4cd3cc363e04c2fc4e92c2b03f52e2896efad01
--- /dev/null
+++ b/hapi/text/sequence_tagging/sequence_tagging.py
@@ -0,0 +1,247 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SequenceTagging network structure
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import sys
+import math
+import argparse
+import numpy as np
+
+from hapi.metrics import Metric
+from hapi.model import Model, Input, set_device
+from hapi.loss import Loss
+from hapi.text.text import SequenceTagging
+
+from hapi.text.sequence_tagging.utils.check import check_gpu, check_version
+from hapi.text.sequence_tagging.utils.configure import PDConfig
+
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import AdamOptimizer
+
+
+class SeqTagging(Model):
+    """
+    Lexical analysis model wrapping the SequenceTagging network.
+    """
+
+    def __init__(self, args, vocab_size, num_labels, length=None,
+                 mode="train"):
+        """
+        Build the underlying SequenceTagging network.
+
+        Args:
+            args: configuration object; word_emb_dim, grnn_hidden_dim,
+                bigru_num and batch_size are read from it, and
+                emb_learning_rate / crf_learning_rate fall back to 1.0
+                when dir(args) does not list them.
+            vocab_size: passed to SequenceTagging as vocab_size.
+            num_labels: passed to SequenceTagging as num_labels.
+            length: passed to SequenceTagging as length.
+            mode: "train" and "test" make forward() pass the label input
+                to the network; any other value runs it without labels.
+        """
+        super(SeqTagging, self).__init__()
+        self.mode_type = mode
+        self.word_emb_dim = args.word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = args.grnn_hidden_dim
+        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
+            args) else 1.0
+        # The CRF rate comes from its own option; it used to be read from
+        # emb_learning_rate by mistake.
+        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
+            args) else 1.0
+        self.bigru_num = args.bigru_num
+        self.batch_size = args.batch_size
+        self.init_bound = 0.1
+        self.length = length
+
+        self.sequence_tagging = SequenceTagging(
+            vocab_size=self.vocab_size,
+            num_labels=self.num_labels,
+            batch_size=self.batch_size,
+            word_emb_dim=self.word_emb_dim,
+            grnn_hidden_dim=self.grnn_hidden_dim,
+            emb_learning_rate=self.emb_lr,
+            crf_learning_rate=self.crf_lr,
+            bigru_num=self.bigru_num,
+            init_bound=self.init_bound,
+            length=self.length)
+
+    def forward(self, *inputs):
+        """
+        Run the network on (word, lengths[, target]).
+
+        inputs[2] is read as the target only in "train" / "test" mode.
+        """
+        word = inputs[0]
+        lengths = inputs[1]
+        if self.mode_type == "train" or self.mode_type == "test":
+            target = inputs[2]
+            outputs = self.sequence_tagging(word, lengths, target)
+        else:
+            outputs = self.sequence_tagging(word, lengths)
+        return outputs
+
+
+class Chunk_eval(fluid.dygraph.Layer):
+    """
+    Layer that appends a chunk_eval op and returns its chunk counts.
+    """
+
+    def __init__(self,
+                 num_chunk_types,
+                 chunk_scheme,
+                 excluded_chunk_types=None):
+        super(Chunk_eval, self).__init__()
+        self.num_chunk_types = num_chunk_types
+        self.chunk_scheme = chunk_scheme
+        self.excluded_chunk_types = excluded_chunk_types
+
+    def forward(self, input, label, seq_length=None):
+        """
+        Append the chunk_eval op for input and label.
+
+        The op also writes precision, recall and F1 outputs, but only the
+        three count variables are returned:
+        (num_infer_chunks, num_label_chunks, num_correct_chunks).
+        """
+        precision = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        recall = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        f1_score = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        num_infer_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_label_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_correct_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        this_input = {"Inference": input, "Label": label}
+        if seq_length is not None:
+            this_input["SeqLength"] = seq_length
+        self._helper.append_op(
+            type='chunk_eval',
+            inputs=this_input,
+            outputs={
+                "Precision": [precision],
+                "Recall": [recall],
+                "F1-Score": [f1_score],
+                "NumInferChunks": [num_infer_chunks],
+                "NumLabelChunks": [num_label_chunks],
+                "NumCorrectChunks": [num_correct_chunks]
+            },
+            attrs={
+                "num_chunk_types": self.num_chunk_types,
+                "chunk_scheme": self.chunk_scheme,
+                "excluded_chunk_types": self.excluded_chunk_types or []
+            })
+        return (num_infer_chunks, num_label_chunks, num_correct_chunks)
+
+
+class LacLoss(Loss):
+    """
+    Loss that returns the second element of the model outputs.
+    """
+
+    def __init__(self):
+        super(LacLoss, self).__init__()
+
+    def forward(self, outputs, labels):
+        # labels is unused: outputs[1] is returned as the cost.
+        avg_cost = outputs[1]
+        return avg_cost
+
+
+class ChunkEval(Metric):
+    """
+    Chunk-level precision, recall and F1 metric built on Chunk_eval.
+    """
+
+    def __init__(self, num_labels, name=None, *args, **kwargs):
+        super(ChunkEval, self).__init__(*args, **kwargs)
+        self._init_name(name)
+        self.chunk_eval = Chunk_eval(
+            int(math.ceil((num_labels - 1) / 2.0)), "IOB")
+        self.reset()
+
+    def add_metric_op(self, *args):
+        """
+        Pass args[0] as input, args[2] as seq_length and args[3] as label
+        to Chunk_eval and return the three chunk counts as a list.
+        """
+        crf_decode = args[0]
+        lengths = args[2]
+        label = args[3]
+        (num_infer_chunks, num_label_chunks,
+         num_correct_chunks) = self.chunk_eval(
+             input=crf_decode, label=label, seq_length=lengths)
+        return [num_infer_chunks, num_label_chunks, num_correct_chunks]
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
+               *args, **kwargs):
+        """
+        Add one batch of counts to the running totals and return that
+        batch's [precision, recall, f1_score].
+        """
+        self.infer_chunks_total += num_infer_chunks
+        self.label_chunks_total += num_label_chunks
+        self.correct_chunks_total += num_correct_chunks
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return [precision, recall, f1_score]
+
+    def reset(self):
+        """
+        Zero the running chunk totals.
+        """
+        self.infer_chunks_total = 0
+        self.label_chunks_total = 0
+        self.correct_chunks_total = 0
+
+    def accumulate(self):
+        """
+        Return [precision, recall, f1_score] over the running totals.
+        """
+        precision = float(
+            self.correct_chunks_total
+        ) / self.infer_chunks_total if self.infer_chunks_total else 0
+        recall = float(
+            self.correct_chunks_total
+        ) / self.label_chunks_total if self.label_chunks_total else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.correct_chunks_total else 0
+        res = [precision, recall, f1_score]
+        return res
+
+    def _init_name(self, name):
+        # The metric names are fixed; the name argument is not used.
+        self._name = ['precision', 'recall', 'F1']
+
+    def name(self):
+        return self._name
+
diff --git a/examples/sequence_tagging/utils/__init__.py b/hapi/text/sequence_tagging/utils/__init__.py
similarity index 100%
rename from examples/sequence_tagging/utils/__init__.py
rename to hapi/text/sequence_tagging/utils/__init__.py
diff --git a/examples/sequence_tagging/utils/check.py b/hapi/text/sequence_tagging/utils/check.py
similarity index 100%
rename from examples/sequence_tagging/utils/check.py
rename to hapi/text/sequence_tagging/utils/check.py
diff --git a/examples/sequence_tagging/utils/configure.py b/hapi/text/sequence_tagging/utils/configure.py
similarity index 100%
rename from examples/sequence_tagging/utils/configure.py
rename to hapi/text/sequence_tagging/utils/configure.py
diff --git a/examples/sequence_tagging/utils/metrics.py b/hapi/text/sequence_tagging/utils/metrics.py
similarity index 100%
rename from examples/sequence_tagging/utils/metrics.py
rename to hapi/text/sequence_tagging/utils/metrics.py