diff --git a/lac.py b/lac.py deleted file mode 100644 index cdd380686256b2039f6aa7f2289639559969a6a8..0000000000000000000000000000000000000000 --- a/lac.py +++ /dev/null @@ -1,728 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -lexical analysis network structure -""" - -from __future__ import division -from __future__ import print_function - -import io -import os -import sys -import math -import argparse -import numpy as np - -from metrics import Metric -from model import Model, Input, Loss, set_device - -import paddle.fluid as fluid -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.initializer import NormalInitializer -from paddle.fluid.dygraph.nn import Embedding, Linear, GRUUnit - - -class DynamicGRU(fluid.dygraph.Layer): - def __init__(self, - size, - h_0=None, - param_attr=None, - bias_attr=None, - is_reverse=False, - gate_activation='sigmoid', - candidate_activation='tanh', - origin_mode=False, - init_size=None): - super(DynamicGRU, self).__init__() - - self.gru_unit = GRUUnit( - size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - - self.size = size - self.h_0 = h_0 - self.is_reverse = is_reverse - - def forward(self, inputs): - hidden = self.h_0 - res = [] - - for i in range(inputs.shape[1]): - if self.is_reverse: - i = inputs.shape[1] - 1 - i - input_ = inputs[:, i:i + 1, :] - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) - hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) - res.append(hidden_) - if self.is_reverse: - res = res[::-1] - res = fluid.layers.concat(res, axis=1) - return res - - -class BiGRU(fluid.dygraph.Layer): - def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): - super(BiGRU, self).__init__() - - self.pre_gru = Linear( - input_dim=input_dim, - output_dim=grnn_hidden_dim * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.gru = DynamicGRU( - size=grnn_hidden_dim, - h_0=h_0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.pre_gru_r = Linear( - input_dim=input_dim, - output_dim=grnn_hidden_dim * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.gru_r = DynamicGRU( - size=grnn_hidden_dim, - is_reverse=True, - h_0=h_0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - 
regularization_coeff=1e-4))) - - def forward(self, input_feature): - res_pre_gru = self.pre_gru(input_feature) - res_gru = self.gru(res_pre_gru) - res_pre_gru_r = self.pre_gru_r(input_feature) - res_gru_r = self.gru_r(res_pre_gru_r) - bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1) - return bi_merge - - -class Linear_chain_crf(fluid.dygraph.Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Linear_chain_crf, self).__init__() - - self._param_attr = param_attr - self._dtype = dtype - self._size = size - self._is_test = is_test - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - return self._transition - - @weight.setter - def weight(self, value): - self._transition = value - - def forward(self, input, label, length=None): - - alpha = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - emission_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - transition_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - log_likelihood = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": [label] - } - if length: - this_inputs['Length'] = [length] - self._helper.append_op( - type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }, - attrs={"is_test": self._is_test, }) - return log_likelihood - - -class Crf_decoding(fluid.dygraph.Layer): - def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): - super(Crf_decoding, self).__init__() - - self._dtype = dtype - self._size = size - self._is_test = is_test - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - return self._transition - - @weight.setter - def weight(self, value): - self._transition = value - - def forward(self, input, label=None, length=None): - - viterbi_path = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": label - } - if length: - this_inputs['Length'] = [length] - self._helper.append_op( - type='crf_decoding', - inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}, - attrs={"is_test": self._is_test, }) - return viterbi_path - - -class Chunk_eval(fluid.dygraph.Layer): - def __init__(self, - num_chunk_types, - chunk_scheme, - excluded_chunk_types=None): - super(Chunk_eval, self).__init__() - self.num_chunk_types = num_chunk_types - self.chunk_scheme = chunk_scheme - self.excluded_chunk_types = excluded_chunk_types - - def forward(self, input, label, seq_length=None): - precision = self._helper.create_variable_for_type_inference( - dtype="float32") - recall = self._helper.create_variable_for_type_inference( - dtype="float32") - f1_score = self._helper.create_variable_for_type_inference( - dtype="float32") - num_infer_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_label_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - num_correct_chunks = self._helper.create_variable_for_type_inference( - dtype="int64") - - this_input = {"Inference": 
input, "Label": label[0]} - if seq_length: - this_input["SeqLength"] = seq_length[0] - self._helper.append_op( - type='chunk_eval', - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks] - }, - attrs={ - "num_chunk_types": self.num_chunk_types, - "chunk_scheme": self.chunk_scheme, - "excluded_chunk_types": self.excluded_chunk_types or [] - }) - return (num_infer_chunks, num_label_chunks, num_correct_chunks) - - -class LAC(Model): - def __init__(self, args, vocab_size, num_labels, length=None): - super(LAC, self).__init__() - """ - define the lexical analysis network structure - word: stores the input of the model - for_infer: a boolean value, indicating if the model to be created is for training or predicting. - - return: - for infer: return the prediction - otherwise: return the prediction - """ - self.word_emb_dim = args.word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = args.grnn_hidden_dim - self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir( - args) else 1.0 - self.crf_lr = args.emb_learning_rate if 'crf_learning_rate' in dir( - args) else 1.0 - self.bigru_num = args.bigru_num - self.init_bound = 0.1 - - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( - learning_rate=self.emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound))) - - h_0 = fluid.layers.create_global_var( - shape=[args.batch_size, self.grnn_hidden_dim], - value=0.0, - dtype='float32', - persistable=True, - force_cpu=True, - name='h_0') - - self.bigru_units = [] - for i in range(self.bigru_num): - if i == 0: - self.bigru_units.append( - self.add_sublayer( - "bigru_units%d" % i, - BiGRU( - self.grnn_hidden_dim, - self.grnn_hidden_dim, - self.init_bound, - h_0=h_0))) - else: - self.bigru_units.append( - self.add_sublayer( - "bigru_units%d" % i, - BiGRU( - self.grnn_hidden_dim * 2, - self.grnn_hidden_dim, - self.init_bound, - h_0=h_0))) - - self.fc = Linear( - input_dim=self.grnn_hidden_dim * 2, - output_dim=self.num_labels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.linear_chain_crf = Linear_chain_crf( - param_attr=fluid.ParamAttr( - name='linear_chain_crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - self.crf_decoding = Crf_decoding( - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - def forward(self, word, target, lengths): - """ - Configure the network - """ - word_embed = self.word_embedding(word) - input_feature = word_embed - - for i in range(self.bigru_num): - bigru_output = self.bigru_units[i](input_feature) - input_feature = bigru_output - - emission = self.fc(bigru_output) - - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - - -class LacLoss(Loss): - def __init__(self): - super(LacLoss, self).__init__() - pass - - def forward(self, outputs, labels): - avg_cost = outputs[1] - 
return avg_cost - - -class ChunkEval(Metric): - def __init__(self, num_labels, name=None, *args, **kwargs): - super(ChunkEval, self).__init__(*args, **kwargs) - self._init_name(name) - self.chunk_eval = Chunk_eval( - int(math.ceil((num_labels - 1) / 2.0)), "IOB") - self.reset() - - def add_metric_op(self, pred, label, *args, **kwargs): - crf_decode = pred[0] - lengths = pred[2] - (num_infer_chunks, num_label_chunks, - num_correct_chunks) = self.chunk_eval( - input=crf_decode, label=label, seq_length=lengths) - return [num_infer_chunks, num_label_chunks, num_correct_chunks] - - def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks, - *args, **kwargs): - self.infer_chunks_total += num_infer_chunks - self.label_chunks_total += num_label_chunks - self.correct_chunks_total += num_correct_chunks - precision = float( - num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0 - recall = float( - num_correct_chunks) / num_label_chunks if num_label_chunks else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if num_correct_chunks else 0 - return [precision, recall, f1_score] - - def reset(self): - self.infer_chunks_total = 0 - self.label_chunks_total = 0 - self.correct_chunks_total = 0 - - def accumulate(self): - precision = float( - self.correct_chunks_total - ) / self.infer_chunks_total if self.infer_chunks_total else 0 - recall = float( - self.correct_chunks_total - ) / self.label_chunks_total if self.label_chunks_total else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if self.correct_chunks_total else 0 - res = [precision, recall, f1_score] - return res - - def _init_name(self, name): - name = name or 'chunk eval' - self._name = ['precision', 'recall', 'F1'] - - def name(self): - return self._name - - -class LacDataset(object): - """ - Load lexical analysis dataset - """ - - def __init__(self, args): - self.word_dict_path = args.word_dict_path - self.label_dict_path = args.label_dict_path - self.word_rep_dict_path = args.word_rep_dict_path - self._load_dict() - - def _load_dict(self): - self.word2id_dict = self.load_kv_dict( - self.word_dict_path, reverse=True, value_func=np.int64) - self.id2word_dict = self.load_kv_dict(self.word_dict_path) - self.label2id_dict = self.load_kv_dict( - self.label_dict_path, reverse=True, value_func=np.int64) - self.id2label_dict = self.load_kv_dict(self.label_dict_path) - if self.word_rep_dict_path is None: - self.word_replace_dict = dict() - else: - self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path) - - def load_kv_dict(self, - dict_path, - reverse=False, - delimiter="\t", - key_func=None, - value_func=None): - """ - Load key-value dict from file - """ - result_dict = {} - for line in io.open(dict_path, "r", encoding='utf8'): - terms = line.strip("\n").split(delimiter) - if len(terms) != 2: - continue - if reverse: - value, key = terms - else: - key, value = terms - if key in result_dict: - raise KeyError("key duplicated with [%s]" % (key)) - if key_func: - key = key_func(key) - if value_func: - value = value_func(value) - result_dict[key] = value - return result_dict - - @property - def vocab_size(self): - return len(self.word2id_dict.values()) - - @property - def num_labels(self): - return len(self.label2id_dict.values()) - - def get_num_examples(self, filename): - """num of line of file""" - return sum(1 for line in io.open(filename, "r", encoding='utf8')) - - def word_to_ids(self, words): - """convert word to word index""" - word_ids = [] - for word in words: - word = 
self.word_replace_dict.get(word, word) - if word not in self.word2id_dict: - word = "OOV" - word_id = self.word2id_dict[word] - word_ids.append(word_id) - - return word_ids - - def label_to_ids(self, labels): - """convert label to label index""" - label_ids = [] - for label in labels: - if label not in self.label2id_dict: - label = "O" - label_id = self.label2id_dict[label] - label_ids.append(label_id) - return label_ids - - def file_reader(self, - filename, - mode="train", - batch_size=32, - max_seq_len=126): - """ - yield (word_idx, target_idx) one by one from file, - or yield (word_idx, ) in `infer` mode - """ - - def wrapper(): - fread = io.open(filename, "r", encoding="utf-8") - headline = next(fread) - headline = headline.strip().split('\t') - assert len(headline) == 2 and headline[0] == "text_a" and headline[ - 1] == "label" - buf = [] - for line in fread: - words, labels = line.strip("\n").split("\t") - if len(words) < 1: - continue - word_ids = self.word_to_ids(words.split("\002")) - label_ids = self.label_to_ids(labels.split("\002")) - assert len(word_ids) == len(label_ids) - word_ids = word_ids[0:max_seq_len] - words_len = np.int64(len(word_ids)) - word_ids += [0 for _ in range(max_seq_len - words_len)] - label_ids = label_ids[0:max_seq_len] - label_ids += [0 for _ in range(max_seq_len - words_len)] - assert len(word_ids) == len(label_ids) - yield word_ids, label_ids, words_len - fread.close() - - return wrapper - - -def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): - def wrapper(): - batch_words, batch_labels, seq_lens = [], [], [] - for epoch in xrange(args.epoch): - for instance in reader.file_reader( - file_name, mode, max_seq_len=args.max_seq_len)(): - words, labels, words_len = instance - if len(seq_lens) < args.batch_size: - batch_words.append(words) - batch_labels.append(labels) - seq_lens.append(words_len) - if len(seq_lens) == args.batch_size: - yield batch_words, batch_labels, seq_lens, batch_labels - batch_words, batch_labels, seq_lens = [], [], [] - - if len(seq_lens) > 0: - yield batch_words, batch_labels, seq_lens, batch_labels - batch_words, batch_labels, seq_lens = [], [], [] - - return wrapper - - -def create_dataloader(generator, place, feed_list=None): - if not feed_list: - data_loader = fluid.io.DataLoader.from_generator( - capacity=50, - use_double_buffer=True, - iterable=True, - return_list=True) - else: - data_loader = fluid.io.DataLoader.from_generator( - feed_list=feed_list, - capacity=50, - use_double_buffer=True, - iterable=True, - return_list=True) - data_loader.set_batch_generator(generator, places=place) - return data_loader - - -def main(args): - place = set_device(args.device) - fluid.enable_dygraph(place) if args.dynamic else None - - inputs = [ - Input( - [None, args.max_seq_len], 'int64', name='words'), Input( - [None, args.max_seq_len], 'int64', name='target'), Input( - [None], 'int64', name='length') - ] - labels = [Input([None, args.max_seq_len], 'int64', name='labels')] - - feed = [x.forward() for x in inputs + labels] - dataset = LacDataset(args) - train_path = os.path.join(args.data, "train.tsv") - test_path = os.path.join(args.data, "test.tsv") - - if args.dynamic: - feed_list = None - else: - feed_list = feed - train_generator = create_lexnet_data_generator( - args, reader=dataset, file_name=train_path, place=place, mode="train") - test_generator = create_lexnet_data_generator( - args, reader=dataset, file_name=test_path, place=place, mode="test") - - train_dataset = create_dataloader( - train_generator, 
place, feed_list=feed_list) - test_dataset = create_dataloader( - test_generator, place, feed_list=feed_list) - - vocab_size = dataset.vocab_size - num_labels = dataset.num_labels - model = LAC(args, vocab_size, num_labels) - - optim = AdamOptimizer( - learning_rate=args.base_learning_rate, - parameter_list=model.parameters()) - - model.prepare( - optim, - LacLoss(), - ChunkEval(num_labels), - inputs=inputs, - labels=labels, - device=args.device) - - if args.resume is not None: - model.load(args.resume) - - model.fit(train_dataset, - test_dataset, - epochs=args.epoch, - batch_size=args.batch_size, - eval_freq=args.eval_freq, - save_freq=args.save_freq, - save_dir=args.save_dir) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser("LAC training") - parser.add_argument( - "-dir", "--data", default=None, type=str, help='path to LAC dataset') - parser.add_argument( - "-wd", - "--word_dict_path", - default=None, - type=str, - help='word dict path') - parser.add_argument( - "-ld", - "--label_dict_path", - default=None, - type=str, - help='label dict path') - parser.add_argument( - "-wrd", - "--word_rep_dict_path", - default=None, - type=str, - help='The path of the word replacement Dictionary.') - parser.add_argument( - "-dev", - "--device", - type=str, - default='gpu', - help="device to use, gpu or cpu") - parser.add_argument( - "-d", "--dynamic", action='store_true', help="enable dygraph mode") - parser.add_argument( - "-e", "--epoch", default=10, type=int, help="number of epoch") - parser.add_argument( - '-lr', - '--base_learning_rate', - default=1e-3, - type=float, - metavar='LR', - help='initial learning rate') - parser.add_argument( - "--word_emb_dim", - default=128, - type=int, - help='word embedding dimension') - parser.add_argument( - "--grnn_hidden_dim", default=128, type=int, help="hidden dimension") - parser.add_argument( - "--bigru_num", default=2, type=int, help='the number of bi-rnn') - parser.add_argument("-elr", "--emb_learning_rate", default=1.0, type=float) - parser.add_argument("-clr", "--crf_learning_rate", default=1.0, type=float) - parser.add_argument( - "-b", "--batch_size", default=300, type=int, help="batch size") - parser.add_argument( - "--max_seq_len", default=126, type=int, help="max sequence length") - parser.add_argument( - "-n", "--num_devices", default=1, type=int, help="number of devices") - parser.add_argument( - "-r", - "--resume", - default=None, - type=str, - help="checkpoint path to resume") - parser.add_argument( - "-o", - "--save_dir", - default="./model", - type=str, - help="save model path") - parser.add_argument( - "-sf", "--save_freq", default=1, type=int, help="save frequency") - parser.add_argument( - "-ef", "--eval_freq", default=1, type=int, help="eval frequency") - - args = parser.parse_args() - print(args) - main(args) diff --git a/sequence_tagging/reader.py b/sequence_tagging/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..5cdba92dde33ed534a1a71a6588875e3ba4d2d3d --- /dev/null +++ b/sequence_tagging/reader.py @@ -0,0 +1,186 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SequenceTagging dataset
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import numpy as np
+
+import paddle.fluid as fluid
+
+
+class LacDataset(object):
+    """
+    Load lexical analysis dataset
+    """
+
+    def __init__(self, args):
+        self.word_dict_path = args.word_dict_path
+        self.label_dict_path = args.label_dict_path
+        self.word_rep_dict_path = args.word_rep_dict_path
+        self._load_dict()
+
+    def _load_dict(self):
+        self.word2id_dict = self.load_kv_dict(
+            self.word_dict_path, reverse=True, value_func=np.int64)
+        self.id2word_dict = self.load_kv_dict(self.word_dict_path)
+        self.label2id_dict = self.load_kv_dict(
+            self.label_dict_path, reverse=True, value_func=np.int64)
+        self.id2label_dict = self.load_kv_dict(self.label_dict_path)
+        if self.word_rep_dict_path is None:
+            self.word_replace_dict = dict()
+        else:
+            self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)
+
+    def load_kv_dict(self,
+                     dict_path,
+                     reverse=False,
+                     delimiter="\t",
+                     key_func=None,
+                     value_func=None):
+        """
+        Load key-value dict from file
+        """
+        result_dict = {}
+        for line in io.open(dict_path, "r", encoding='utf8'):
+            terms = line.strip("\n").split(delimiter)
+            if len(terms) != 2:
+                continue
+            if reverse:
+                value, key = terms
+            else:
+                key, value = terms
+            if key in result_dict:
+                raise KeyError("key duplicated with [%s]" % (key))
+            if key_func:
+                key = key_func(key)
+            if value_func:
+                value = value_func(value)
+            result_dict[key] = value
+        return result_dict
+
+    @property
+    def vocab_size(self):
+        return len(self.word2id_dict.values())
+
+    @property
+    def num_labels(self):
+        return len(self.label2id_dict.values())
+
+    def get_num_examples(self, filename):
+        """number of lines in the file"""
+        return sum(1 for line in io.open(filename, "r", encoding='utf8'))
+
+    def word_to_ids(self, words):
+        """convert word to word index"""
+        word_ids = []
+        for word in words:
+            word = self.word_replace_dict.get(word, word)
+            if word not in self.word2id_dict:
+                word = "OOV"
+            word_id = self.word2id_dict[word]
+            word_ids.append(word_id)
+
+        return word_ids
+
+    def label_to_ids(self, labels):
+        """convert label to label index"""
+        label_ids = []
+        for label in labels:
+            if label not in self.label2id_dict:
+                label = "O"
+            label_id = self.label2id_dict[label]
+            label_ids.append(label_id)
+        return label_ids
+
+    def file_reader(self,
+                    filename,
+                    mode="train",
+                    batch_size=32,
+                    max_seq_len=126):
+        """
+        yield (word_ids, label_ids, words_len) tuples one by one from file;
+        sequences are truncated and padded to max_seq_len
+        """
+
+        def wrapper():
+            fread = io.open(filename, "r", encoding="utf-8")
+            headline = next(fread)
+            headline = headline.strip().split('\t')
+            assert len(headline) == 2 and headline[
+                0] == "text_a" and headline[1] == "label"
+            buf = []
+            for line in fread:
+                words, labels = line.strip("\n").split("\t")
+                if len(words) < 1:
+                    continue
+                word_ids = self.word_to_ids(words.split("\002"))
+                label_ids = self.label_to_ids(labels.split("\002"))
+                assert len(word_ids) == len(label_ids)
+                word_ids = word_ids[0:max_seq_len]
+                words_len = np.int64(len(word_ids))
+                word_ids += [0 for _ in range(max_seq_len - words_len)]
+                label_ids = label_ids[0:max_seq_len]
+                label_ids += [0 for _ in range(max_seq_len - words_len)]
+                assert len(word_ids) == len(label_ids)
+                yield word_ids, label_ids, words_len
+            fread.close()
+
+        return wrapper
+
+
+def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
+    def wrapper():
+        batch_words, batch_labels, seq_lens = [], [], []
+        for epoch in range(args.epoch):
+            for instance in reader.file_reader(
+                    file_name, mode, max_seq_len=args.max_seq_len)():
+                words, labels, words_len = instance
+                if len(seq_lens) < args.batch_size:
+                    batch_words.append(words)
+                    batch_labels.append(labels)
+                    seq_lens.append(words_len)
+                if len(seq_lens) == args.batch_size:
+                    yield batch_words, batch_labels, seq_lens, batch_labels
+                    batch_words, batch_labels, seq_lens = [], [], []
+
+        if len(seq_lens) > 0:
+            yield batch_words, batch_labels, seq_lens, batch_labels
+            batch_words, batch_labels, seq_lens = [], [], []
+
+    return wrapper
+
+
+def create_dataloader(generator, place, feed_list=None):
+    if not feed_list:
+        data_loader = fluid.io.DataLoader.from_generator(
+            capacity=50,
+            use_double_buffer=True,
+            iterable=True,
+            return_list=True)
+    else:
+        data_loader = fluid.io.DataLoader.from_generator(
+            feed_list=feed_list,
+            capacity=50,
+            use_double_buffer=True,
+            iterable=True,
+            return_list=True)
+    data_loader.set_batch_generator(generator, places=place)
+    return data_loader
+
+
diff --git a/sequence_tagging/sequence_tagging.py b/sequence_tagging/sequence_tagging.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0928dd5b7b2d774b8416ad173dae5464172582e
--- /dev/null
+++ b/sequence_tagging/sequence_tagging.py
@@ -0,0 +1,323 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SequenceTagging network structure
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import sys
+import math
+import argparse
+import numpy as np
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from metrics import Metric
+from model import Model, Input, Loss, set_device
+from text import SequenceTagging
+from reader import LacDataset, create_lexnet_data_generator, create_dataloader
+
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import AdamOptimizer
+
+
+class SeqTagging(Model):
+    def __init__(self, args, vocab_size, num_labels, length=None):
+        super(SeqTagging, self).__init__()
+        """
+        define the sequence tagging network structure
+
+        word: stores the input of the model
+        target: golden (ground-truth) tags for each token
+        lengths: valid lengths of the input sequences
+
+        return: the CRF decoding path, the average CRF cost and lengths
+        """
+        self.word_emb_dim = args.word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = args.grnn_hidden_dim
+        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
+            args) else 1.0
+        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
+            args) else 1.0
+        self.bigru_num = args.bigru_num
+        self.batch_size = args.batch_size
+        self.init_bound = 0.1
+        self.length = length
+
+        self.sequence_tagging = SequenceTagging(
+            vocab_size=self.vocab_size,
+            num_labels=self.num_labels,
+            batch_size=self.batch_size,
+            word_emb_dim=self.word_emb_dim,
+            grnn_hidden_dim=self.grnn_hidden_dim,
+            emb_learning_rate=self.emb_lr,
+            crf_learning_rate=self.crf_lr,
+            bigru_num=self.bigru_num,
+            init_bound=self.init_bound,
+            length=self.length)
+
+    def forward(self, word, target, lengths):
+        """
+        Configure the network
+        """
+        crf_decode, avg_cost, lengths = self.sequence_tagging(word, target, lengths)
+        return crf_decode, avg_cost, lengths
+
+
+class Chunk_eval(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_chunk_types,
+                 chunk_scheme,
+                 excluded_chunk_types=None):
+        super(Chunk_eval, self).__init__()
+        self.num_chunk_types = num_chunk_types
+        self.chunk_scheme = chunk_scheme
+        self.excluded_chunk_types = excluded_chunk_types
+
+    def forward(self, input, label, seq_length=None):
+        precision = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        recall = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        f1_score = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        num_infer_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_label_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_correct_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+
+        this_input = {"Inference": input, "Label": label[0]}
+        if seq_length:
+            this_input["SeqLength"] = seq_length[0]
+        self._helper.append_op(
+            type='chunk_eval',
+            inputs=this_input,
+            outputs={
+                "Precision": [precision],
+                "Recall": [recall],
+                "F1-Score": [f1_score],
+                "NumInferChunks": [num_infer_chunks],
+                "NumLabelChunks": [num_label_chunks],
+                "NumCorrectChunks": [num_correct_chunks]
+            },
+            attrs={
+                "num_chunk_types": self.num_chunk_types,
+                "chunk_scheme": self.chunk_scheme,
+                "excluded_chunk_types": self.excluded_chunk_types or []
+            })
+        return (num_infer_chunks, num_label_chunks, num_correct_chunks)
+
+
+class LacLoss(Loss):
+    def __init__(self):
+        super(LacLoss, self).__init__()
+        pass
+
+    def forward(self, outputs, labels):
+        avg_cost = outputs[1]
+        return avg_cost
+
+
+class ChunkEval(Metric):
+    def __init__(self, num_labels, name=None, *args, **kwargs):
+        super(ChunkEval, self).__init__(*args, **kwargs)
+        self._init_name(name)
+        self.chunk_eval = Chunk_eval(
+            int(math.ceil((num_labels - 1) / 2.0)), "IOB")
+        self.reset()
+
+    def add_metric_op(self, pred, label, *args, **kwargs):
+        crf_decode = pred[0]
+        lengths = pred[2]
+        (num_infer_chunks, num_label_chunks,
+         num_correct_chunks) = self.chunk_eval(
+             input=crf_decode, label=label, seq_length=lengths)
+        return [num_infer_chunks, num_label_chunks, num_correct_chunks]
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
+               *args, **kwargs):
+        self.infer_chunks_total += num_infer_chunks
+        self.label_chunks_total += num_label_chunks
+        self.correct_chunks_total += num_correct_chunks
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return [precision, recall, f1_score]
+
+    def reset(self):
+        self.infer_chunks_total = 0
+        self.label_chunks_total = 0
+        self.correct_chunks_total = 0
+
+    def accumulate(self):
+        precision = float(
+            self.correct_chunks_total
+        ) / self.infer_chunks_total if self.infer_chunks_total else 0
+        recall = float(
+            self.correct_chunks_total
+        ) / self.label_chunks_total if self.label_chunks_total else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.correct_chunks_total else 0
+        res = [precision, recall, f1_score]
+        return res
+
+    def _init_name(self, name):
+        name = name or 'chunk eval'
+        self._name = ['precision', 'recall', 'F1']
+
+    def name(self):
+        return self._name
+
+
+def main(args):
+    place = set_device(args.device)
+    fluid.enable_dygraph(place) if args.dynamic else None
+
+    inputs = [
+        Input(
+            [None, args.max_seq_len], 'int64', name='words'), Input(
+                [None, args.max_seq_len], 'int64', name='target'), Input(
+                    [None], 'int64', name='length')
+    ]
+    labels = [Input([None, args.max_seq_len], 'int64', name='labels')]
+
+    feed_list = None if args.dynamic else [x.forward() for x in inputs + labels]
+    dataset = LacDataset(args)
+    train_path = os.path.join(args.data, "train.tsv")
+    test_path = os.path.join(args.data, "test.tsv")
+
+    train_generator = create_lexnet_data_generator(
+        args, reader=dataset, file_name=train_path, place=place, mode="train")
+    test_generator = create_lexnet_data_generator(
+        args, reader=dataset, file_name=test_path, place=place, mode="test")
+
+    train_dataset = create_dataloader(
+        train_generator, place, feed_list=feed_list)
+    test_dataset = create_dataloader(
+        test_generator, place, feed_list=feed_list)
+
+    vocab_size = dataset.vocab_size
+    num_labels = dataset.num_labels
+    model = SeqTagging(args, vocab_size, num_labels)
+
+    optim = AdamOptimizer(
+        learning_rate=args.base_learning_rate,
+        parameter_list=model.parameters())
+
+    model.prepare(
+        optim,
+        LacLoss(),
+        ChunkEval(num_labels),
+        inputs=inputs,
+        labels=labels,
+        device=args.device)
+
+    if args.resume is not None:
+        model.load(args.resume)
+
+    model.fit(train_dataset,
+              test_dataset,
+              epochs=args.epoch,
+              batch_size=args.batch_size,
+              eval_freq=args.eval_freq,
+              save_freq=args.save_freq,
+              save_dir=args.save_dir)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("LAC training")
+    parser.add_argument(
+        "-dir", "--data", default=None, type=str, help='path to LAC dataset')
+    parser.add_argument(
+        "-wd",
+        "--word_dict_path",
+        default=None,
+        type=str,
+        help='word dict path')
+    parser.add_argument(
+        "-ld",
+        "--label_dict_path",
+        default=None,
+        type=str,
+        help='label dict path')
+    parser.add_argument(
+        "-wrd",
+        "--word_rep_dict_path",
+        default=None,
+        type=str,
+        help='the path of the word replacement dictionary')
+    parser.add_argument(
+        "-dev",
+        "--device",
+        type=str,
+        default='gpu',
+        help="device to use, gpu or cpu")
+    parser.add_argument(
+        "-d", "--dynamic", action='store_true', help="enable dygraph mode")
+    parser.add_argument(
+        "-e", "--epoch", default=10, type=int, help="number of epochs")
+    parser.add_argument(
+        '-lr',
+        '--base_learning_rate',
+        default=1e-3,
+        type=float,
+        metavar='LR',
+        help='initial learning rate')
+    parser.add_argument(
+        "--word_emb_dim",
+        default=128,
type=int, + help='word embedding dimension') + parser.add_argument( + "--grnn_hidden_dim", default=128, type=int, help="hidden dimension") + parser.add_argument( + "--bigru_num", default=2, type=int, help='the number of bi-rnn') + parser.add_argument("-elr", "--emb_learning_rate", default=1.0, type=float) + parser.add_argument("-clr", "--crf_learning_rate", default=1.0, type=float) + parser.add_argument( + "-b", "--batch_size", default=300, type=int, help="batch size") + parser.add_argument( + "--max_seq_len", default=126, type=int, help="max sequence length") + parser.add_argument( + "-n", "--num_devices", default=1, type=int, help="number of devices") + parser.add_argument( + "-r", + "--resume", + default=None, + type=str, + help="checkpoint path to resume") + parser.add_argument( + "-o", + "--save_dir", + default="./model", + type=str, + help="save model path") + parser.add_argument( + "-sf", "--save_freq", default=1, type=int, help="save frequency") + parser.add_argument( + "-ef", "--eval_freq", default=1, type=int, help="eval frequency") + + args = parser.parse_args() + print(args) + main(args) diff --git a/text.py b/text.py index 2702981d6e274ab20e250b7499b65632ffd7a3ba..880c413509cc96a17ac2ef4a1ffeea626ab4240c 100644 --- a/text.py +++ b/text.py @@ -8,7 +8,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.layers.utils as utils from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm +from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit from paddle.fluid.data_feeder import convert_dtype from paddle.fluid import layers @@ -19,8 +19,8 @@ __all__ = [ 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', - 'TransformerDecoder', 'TransformerBeamSearchDecoder' -] + 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'DynamicGRU', 'BiGRU', + 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging'] class RNNCell(Layer): @@ -990,3 +990,299 @@ class TransformerDecoder(Layer): decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) for decoder_layer in self.decoder_layers ] + + +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + h_0=None, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + + +class BiGRU(fluid.dygraph.Layer): + def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): + super(BiGRU, self).__init__() + + self.pre_gru = Linear( + input_dim=input_dim, + 
output_dim=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.gru = DynamicGRU(
+            size=grnn_hidden_dim,
+            h_0=h_0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.pre_gru_r = Linear(
+            input_dim=input_dim,
+            output_dim=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.gru_r = DynamicGRU(
+            size=grnn_hidden_dim,
+            is_reverse=True,
+            h_0=h_0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+    def forward(self, input_feature):
+        res_pre_gru = self.pre_gru(input_feature)
+        res_gru = self.gru(res_pre_gru)
+        res_pre_gru_r = self.pre_gru_r(input_feature)
+        res_gru_r = self.gru_r(res_pre_gru_r)
+        bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
+        return bi_merge
+
+
+class Linear_chain_crf(fluid.dygraph.Layer):
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(Linear_chain_crf, self).__init__()
+
+        self._param_attr = param_attr
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label, length=None):
+
+        alpha = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        emission_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        transition_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        log_likelihood = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": [label]
+        }
+        if length:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='linear_chain_crf',
+            inputs=this_inputs,
+            outputs={
+                "Alpha": [alpha],
+                "EmissionExps": [emission_exps],
+                "TransitionExps": transition_exps,
+                "LogLikelihood": log_likelihood
+            },
+            attrs={"is_test": self._is_test, })
+        return log_likelihood
+
+
+class Crf_decoding(fluid.dygraph.Layer):
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(Crf_decoding, self).__init__()
+
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._param_attr = param_attr
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label=None, length=None):
+
+        viterbi_path = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": label
+        }
+        if length:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='crf_decoding',
+            inputs=this_inputs,
+            outputs={"ViterbiPath": [viterbi_path]},
+            attrs={"is_test": self._is_test, })
+        return viterbi_path
+
+
+class SequenceTagging(fluid.dygraph.Layer):
+    def __init__(self,
+                 vocab_size,
+                 num_labels,
+                 batch_size,
+                 word_emb_dim=128,
+                 grnn_hidden_dim=128,
+                 emb_learning_rate=0.1,
+                 crf_learning_rate=0.1,
+                 bigru_num=2,
+                 init_bound=0.1,
+                 length=None):
+        super(SequenceTagging, self).__init__()
+        """
+        define the sequence tagging network structure
+
+        word: stores the input of the model
+        target: golden (ground-truth) tags for each token
+        lengths: valid lengths of the input sequences
+
+        return: the CRF decoding path, the average CRF cost and lengths
+        """
+        self.word_emb_dim = word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = grnn_hidden_dim
+        self.emb_lr = emb_learning_rate
+        self.crf_lr = crf_learning_rate
+        self.bigru_num = bigru_num
+        self.batch_size = batch_size
+        self.init_bound = init_bound
+
+        self.word_embedding = Embedding(
+            size=[self.vocab_size, self.word_emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                learning_rate=self.emb_lr,
+                name="word_emb",
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound)))
+
+        h_0 = fluid.layers.create_global_var(
+            shape=[self.batch_size, self.grnn_hidden_dim],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            force_cpu=True,
+            name='h_0')
+
+        self.bigru_units = []
+        for i in range(self.bigru_num):
+            if i == 0:
+                self.bigru_units.append(
+                    self.add_sublayer(
+                        "bigru_units%d" % i,
+                        BiGRU(
+                            self.grnn_hidden_dim,
+                            self.grnn_hidden_dim,
+                            self.init_bound,
+                            h_0=h_0)))
+            else:
+                self.bigru_units.append(
+                    self.add_sublayer(
+                        "bigru_units%d" % i,
+                        BiGRU(
+                            self.grnn_hidden_dim * 2,
+                            self.grnn_hidden_dim,
+                            self.init_bound,
+                            h_0=h_0)))
+
+        self.fc = Linear(
+            input_dim=self.grnn_hidden_dim * 2,
+            output_dim=self.num_labels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.linear_chain_crf = Linear_chain_crf(
+            param_attr=fluid.ParamAttr(
+                name='linear_chain_crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+
+        self.crf_decoding = Crf_decoding(
+            param_attr=fluid.ParamAttr(
+                name='crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+
+    def forward(self, word, target, lengths):
+        """
+        Configure the network
+        """
+        word_embed = self.word_embedding(word)
+        input_feature = word_embed
+
+        for i in range(self.bigru_num):
+            bigru_output = self.bigru_units[i](input_feature)
+            input_feature = bigru_output
+
+        emission = self.fc(bigru_output)
+
+        crf_cost = self.linear_chain_crf(
+            input=emission, label=target, length=lengths)
+        avg_cost = fluid.layers.mean(x=crf_cost)
+        self.crf_decoding.weight = self.linear_chain_crf.weight
+        crf_decode = self.crf_decoding(input=emission, length=lengths)
+        return crf_decode, avg_cost, lengths