From b6bd7386b481ffe16d5ad5025d627e6cf0de9a19 Mon Sep 17 00:00:00 2001
From: JesseyXujin <516770320@qq.com>
Date: Wed, 13 Nov 2019 12:50:46 +0000
Subject: [PATCH] add lac in dygraph

---
 dygraph/lexical_analysis/args.yaml |  83 +++++++++++++
 dygraph/lexical_analysis/main.py   | 171 ++++++++++++++++++++++++++
 dygraph/lexical_analysis/nets.py   | 178 +++++++++++++++++++++
 dygraph/lexical_analysis/reader.py | 186 +++++++++++++++++++++++++++++
 dygraph/lexical_analysis/utils.py  |  75 ++++++++++++
 5 files changed, 693 insertions(+)
 create mode 100644 dygraph/lexical_analysis/args.yaml
 create mode 100644 dygraph/lexical_analysis/main.py
 create mode 100644 dygraph/lexical_analysis/nets.py
 create mode 100644 dygraph/lexical_analysis/reader.py
 create mode 100644 dygraph/lexical_analysis/utils.py

diff --git a/dygraph/lexical_analysis/args.yaml b/dygraph/lexical_analysis/args.yaml
new file mode 100644
index 00000000..84ed8576
--- /dev/null
+++ b/dygraph/lexical_analysis/args.yaml
@@ -0,0 +1,83 @@
+model:
+  word_emb_dim:
+    val: 128
+    meaning: "The dimension in which a word is embedded."
+  grnn_hidden_dim:
+    val: 128
+    meaning: "The number of hidden nodes in the GRNN layer."
+  bigru_num:
+    val: 2
+    meaning: "The number of bi_gru layers in the network."
+  init_checkpoint:
+    val: ""
+    meaning: "Path to the initial model checkpoint."
+  inference_save_dir:
+    val: ""
+    meaning: "Path where the inference model is saved."
+
+train:
+  random_seed:
+    val: 0
+    meaning: "Random seed for training."
+  print_steps:
+    val: 1
+    meaning: "Print the training result every N batches."
+  save_steps:
+    val: 10
+    meaning: "Save the model every N batches of training."
+  validation_steps:
+    val: 10
+    meaning: "Run validation every N batches of training."
+  batch_size:
+    val: 100
+    meaning: "The number of sequences contained in a mini-batch."
+  epoch:
+    val: 10
+    meaning: "Number of iterations over the training corpus."
+  use_cuda:
+    val: True
+    meaning: "If set, use GPU for training."
+  traindata_shuffle_buffer:
+    val: 20000
+    meaning: "The buffer size used to shuffle the training data."
+  base_learning_rate:
+    val: 0.001
+    meaning: "The basic learning rate that affects the entire network."
+  emb_learning_rate:
+    val: 2
+    meaning: "The real learning rate of the embedding layer is (emb_learning_rate * base_learning_rate)."
+  crf_learning_rate:
+    val: 0.2
+    meaning: "The real learning rate of the CRF layer is (crf_learning_rate * base_learning_rate)."
+  enable_ce:
+    val: false
+    meaning: "If set, run the task with continuous evaluation logs."
+  cpu_num:
+    val: 10
+    meaning: "The number of CPUs used to train the model; ignored when use_cuda is true."
+
+data:
+  word_dict_path:
+    val: "./conf/word.dic"
+    meaning: "The path of the word dictionary."
+  label_dict_path:
+    val: "./conf/tag.dic"
+    meaning: "The path of the label dictionary."
+  word_rep_dict_path:
+    val: "./conf/q2b.dic"
+    meaning: "The path of the word replacement dictionary."
+  train_data:
+    val: "./data/train.tsv"
+    meaning: "The file where the training data is located."
+  test_data:
+    val: "./data/test.tsv"
+    meaning: "The file where the test data is located."
+  infer_data:
+    val: "./data/infer.tsv"
+    meaning: "The file where the inference data is located."
+  model_save_dir:
+    val: "./models"
+    meaning: "The model will be saved in this path."
+  max_seq_lens:
+    val: 65
+    meaning: "The maximum sentence length of the data."
\ No newline at end of file
diff --git a/dygraph/lexical_analysis/main.py b/dygraph/lexical_analysis/main.py
new file mode 100644
index 00000000..61ddc837
--- /dev/null
+++ b/dygraph/lexical_analysis/main.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+import nets
+import reader
+import utils
+
+
+def train(args, place):
+    with fluid.dygraph.guard(place):
+
+        dataset = reader.Dataset(args)
+        num_train_examples = dataset.get_num_examples(args.train_data)
+
+        max_train_steps = args.epoch * num_train_examples // args.batch_size
+
+        # define readers for the training and test sets
+        train_processor = reader.LACProcessor(args, args.train_data,
+                                              args.word_dict_path)
+        test_processor = reader.LACProcessor(args, args.test_data,
+                                             args.word_dict_path)
+
+        # define network
+        model = nets.LAC("lac_net", args, dataset.vocab_size,
+                         dataset.num_labels)
+
+        sgd_optimizer = fluid.optimizer.Adagrad(
+            learning_rate=args.base_learning_rate)
+        steps = 0
+        total_cost, total_acc, total_num_seqs = [], [], []
+        for eop in range(args.epoch):
+            time_begin = time.time()
+            for data in train_processor.data_generator("train")():
+                steps += 1
+                doc = to_variable(
+                    np.array([
+                        np.pad(x[0][0:args.max_seq_lens], (
+                            0, args.max_seq_lens - len(x[0][
+                                0:args.max_seq_lens])),
+                               'constant',
+                               constant_values=(dataset.vocab_size))
+                        for x in data
+                    ]).astype('int64').reshape(-1, 1))
+
+                seq_lens = to_variable(
+                    np.array([
+                        min(len(x[0]), args.max_seq_lens) for x in data
+                    ]).astype('int64'))
+                targets = to_variable(
+                    np.array([
+                        np.pad(x[1][0:args.max_seq_lens], (
+                            0, args.max_seq_lens - len(x[1][
+                                0:args.max_seq_lens])),
+                               'constant',
+                               constant_values=(dataset.num_labels))
+                        for x in data
+                    ]).astype('int64'))
+
+                model.train()
+                avg_cost, crf_decode = model(doc, targets, seq_lens)
+                avg_cost.backward()
+                np_mask = (doc.numpy() != dataset.vocab_size).astype('int32')
+                word_num = np.sum(np_mask)
+                # token-level accuracy over the non-padding positions
+                acc = np.sum(
+                    (crf_decode.numpy().reshape(-1) ==
+                     targets.numpy().reshape(-1)) *
+                    np_mask.reshape(-1)) / word_num
+                sgd_optimizer.minimize(avg_cost)
+                model.clear_gradients()
+                total_cost.append(avg_cost.numpy() * word_num)
+                total_acc.append(acc * word_num)
+                total_num_seqs.append(word_num)
+
+                if steps % args.print_steps == 0:
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    print("step: %d, ave loss: %f, "
+                          "ave acc: %f, speed: %f steps/s" %
+                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
+                           np.sum(total_acc) / np.sum(total_num_seqs),
+                           args.print_steps / used_time))
+                    total_cost, total_acc, total_num_seqs = [], [], []
+                    time_begin = time.time()
+
+                if steps % args.validation_steps == 0:
+                    total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
+                    model.eval()
+                    eval_steps = 0
+                    for data in test_processor.data_generator("dev")():
+                        eval_doc = to_variable(
+                            np.array([
+                                np.pad(x[0][0:args.max_seq_lens], (
+                                    0, args.max_seq_lens - len(x[0][
+                                        0:args.max_seq_lens])),
+                                       'constant',
+                                       constant_values=(dataset.vocab_size))
+                                for x in data
+                            ]).astype('int64').reshape(-1, 1))
+
+                        eval_seq_lens = to_variable(
+                            np.array([
+                                min(len(x[0]), args.max_seq_lens)
+                                for x in data
+                            ]).astype('int64'))
+
+                        eval_targets = to_variable(
+                            np.array([
+                                np.pad(x[1][0:args.max_seq_lens], (
+                                    0, args.max_seq_lens - len(x[1][
+                                        0:args.max_seq_lens])),
+                                       'constant',
+                                       constant_values=(dataset.num_labels))
+                                for x in data
+                            ]).astype('int64'))
+
+                        eval_avg_cost, eval_crf_decode = model(
+                            eval_doc, eval_targets, eval_seq_lens)
+                        eval_np_mask = (
+                            eval_doc.numpy() !=
+                            dataset.vocab_size).astype('int32')
+                        eval_word_num = np.sum(eval_np_mask)
+                        # token-level accuracy over the non-padding positions
+                        eval_acc = np.sum(
+                            (eval_crf_decode.numpy().reshape(-1) ==
+                             eval_targets.numpy().reshape(-1)) *
+                            eval_np_mask.reshape(-1)) / eval_word_num
+                        total_eval_cost.append(eval_avg_cost.numpy() *
+                                               eval_word_num)
+                        total_eval_acc.append(eval_acc * eval_word_num)
+                        total_eval_num_seqs.append(eval_word_num)
+                        eval_steps += 1
+
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    print("Final validation result: step: %d, ave loss: %f, "
+                          "ave acc: %f, speed: %f steps/s" %
+                          (steps, np.sum(total_eval_cost) /
+                           np.sum(total_eval_num_seqs), np.sum(total_eval_acc) /
+                           np.sum(total_eval_num_seqs), eval_steps / used_time))
+                    time_begin = time.time()
+                    if args.enable_ce:
+                        print("kpis\ttrain_loss\t%0.3f" %
+                              (np.sum(total_eval_cost) /
+                               np.sum(total_eval_num_seqs)))
+                        print("kpis\ttrain_acc\t%0.3f" %
+                              (np.sum(total_eval_acc) /
+                               np.sum(total_eval_num_seqs)))
+
+                if steps % args.save_steps == 0:
+                    save_path = "save_dir_" + str(steps)
+                    print('save model to: ' + save_path)
+                    fluid.dygraph.save_dygraph(model.state_dict(), save_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(__doc__)
+    utils.load_yaml(parser, 'args.yaml')
+    args = parser.parse_args()
+    if args.use_cuda:
+        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = 1
+    print(args)
+    train(args, place)
diff --git a/dygraph/lexical_analysis/nets.py b/dygraph/lexical_analysis/nets.py
new file mode 100644
index 00000000..ce211367
--- /dev/null
+++ b/dygraph/lexical_analysis/nets.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import FC, Embedding
+from paddle.fluid.dygraph import GRUUnit
+from paddle.fluid.dygraph.base import to_variable
+import numpy as np
+
+
+class DynamicGRU(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 h_0=None,
+                 origin_mode=False,
+                 init_size=None):
+        super(DynamicGRU, self).__init__(scope_name)
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+        self.size = size
+        self.h_0 = h_0
+        self.is_reverse = is_reverse
+
+    def forward(self, inputs):
+        # inputs: [batch_size, seq_len, size * 3]
+        hidden = self.h_0
+        res = []
+        for i in range(inputs.shape[1]):
+            if self.is_reverse:
+                i = inputs.shape[1] - 1 - i
+            input_ = inputs[:, i:i + 1, :]
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
+            hidden, reset, gate = self.gru_unit(input_, hidden)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            res.append(hidden_)
+        if self.is_reverse:
+            res = res[::-1]
+        res = fluid.layers.concat(res, axis=1)
+        return res
+
+
+class LAC(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 args,
+                 vocab_size,
+                 num_labels,
+                 for_infer=True,
+                 target=None):
+        super(LAC, self).__init__(name_scope)
+        self.word_emb_dim = args.word_emb_dim
+        self.dict_dim = vocab_size
+        self.grnn_hidden_dim = args.grnn_hidden_dim
+        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
+            args) else 1.0
+        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
+            args) else 1.0
+        self.bigru_num = args.bigru_num
+        self.init_bound = 0.1
+        self.IS_SPARSE = True
+        self.max_seq_lens = args.max_seq_lens
+        # vocab_size is used as the padding id, so the embedding table
+        # needs one extra row
+        self._word_embedding = Embedding(
+            self.full_name(),
+            size=[vocab_size + 1, self.word_emb_dim],
+            dtype='float32',
+            is_sparse=self.IS_SPARSE,
+            param_attr=fluid.ParamAttr(
+                learning_rate=self.emb_lr,
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound)))
+        self._emission_fc = FC(
+            self.full_name(),
+            size=num_labels,
+            num_flatten_dims=2,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+        # stacked bidirectional GRU layers; sub-layers are created here so
+        # that their parameters are registered with the model
+        h_0 = to_variable(
+            np.zeros(
+                (args.batch_size, self.grnn_hidden_dim), dtype='float32'))
+        self.bigru_units = []
+        for i in range(self.bigru_num):
+            fc_fw = self.add_sublayer(
+                "fc_fw_%d" % i,
+                FC(self.full_name(),
+                   size=self.grnn_hidden_dim * 3,
+                   num_flatten_dims=2,
+                   param_attr=fluid.ParamAttr(
+                       initializer=fluid.initializer.Uniform(
+                           low=-self.init_bound, high=self.init_bound),
+                       regularizer=fluid.regularizer.L2DecayRegularizer(
+                           regularization_coeff=1e-4))))
+            gru_fw = self.add_sublayer(
+                "gru_fw_%d" % i,
+                DynamicGRU(
+                    self.full_name(),
+                    size=self.grnn_hidden_dim,
+                    h_0=h_0,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Uniform(
+                            low=-self.init_bound, high=self.init_bound),
+                        regularizer=fluid.regularizer.L2DecayRegularizer(
+                            regularization_coeff=1e-4))))
+            fc_bw = self.add_sublayer(
+                "fc_bw_%d" % i,
+                FC(self.full_name(),
+                   size=self.grnn_hidden_dim * 3,
+                   num_flatten_dims=2,
+                   param_attr=fluid.ParamAttr(
+                       initializer=fluid.initializer.Uniform(
+                           low=-self.init_bound, high=self.init_bound),
+                       regularizer=fluid.regularizer.L2DecayRegularizer(
+                           regularization_coeff=1e-4))))
+            gru_bw = self.add_sublayer(
+                "gru_bw_%d" % i,
+                DynamicGRU(
+                    self.full_name(),
+                    size=self.grnn_hidden_dim,
+                    is_reverse=True,
+                    h_0=h_0,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Uniform(
+                            low=-self.init_bound,
+                            high=self.init_bound),
+                        regularizer=fluid.regularizer.L2DecayRegularizer(
+                            regularization_coeff=1e-4))))
+            self.bigru_units.append((fc_fw, gru_fw, fc_bw, gru_bw))
+
+    def forward(self, inputs, targets, seq_lens):
+        emb = self._word_embedding(inputs)
+        # mask out the padding positions (padding id == self.dict_dim)
+        o_np_mask = (inputs.numpy() != self.dict_dim).astype('float32')
+        mask_emb = fluid.layers.expand(
+            to_variable(o_np_mask), [1, self.word_emb_dim])
+        emb = fluid.layers.reshape(emb, shape=[-1, self.word_emb_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[-1, self.max_seq_lens, self.word_emb_dim])
+        input_feature = emb
+        for (fc_fw, gru_fw, fc_bw, gru_bw) in self.bigru_units:
+            res_fw = gru_fw(fc_fw(input_feature))
+            res_bw = gru_bw(fc_bw(input_feature))
+            input_feature = fluid.layers.concat(
+                input=[res_fw, res_bw], axis=2)
+        emission = self._emission_fc(input_feature)
+
+        if targets is not None:
+            crf_cost = fluid.layers.linear_chain_crf(
+                input=emission,
+                label=targets,
+                param_attr=fluid.ParamAttr(
+                    name='crfw', learning_rate=self.crf_lr),
+                length=seq_lens)
+            avg_cost = fluid.layers.mean(x=crf_cost)
+            crf_decode = fluid.layers.crf_decoding(
+                input=emission,
+                param_attr=fluid.ParamAttr(name='crfw'),
+                length=seq_lens)
+            return avg_cost, crf_decode
+        else:
+            size = emission.shape[-1]
+            fluid.layers.create_parameter(
+                shape=[size + 2, size], dtype=emission.dtype, name='crfw')
+            crf_decode = fluid.layers.crf_decoding(
+                input=emission,
+                param_attr=fluid.ParamAttr(name='crfw'),
+                length=seq_lens)
+            return crf_decode
diff --git a/dygraph/lexical_analysis/reader.py b/dygraph/lexical_analysis/reader.py
new file mode 100644
index 00000000..d4ce0616
--- /dev/null
+++ b/dygraph/lexical_analysis/reader.py
@@ -0,0 +1,186 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The file_reader converts raw corpus to input.
+"""
+
+import os
+import argparse
+import io
+import glob
+import paddle
+
+
+def load_kv_dict(dict_path,
+                 reverse=False,
+                 delimiter="\t",
+                 key_func=None,
+                 value_func=None):
+    """
+    Load key-value dict from file
+    """
+    result_dict = {}
+    for line in io.open(dict_path, "r", encoding='utf8'):
+        terms = line.strip("\n").split(delimiter)
+        if len(terms) != 2:
+            continue
+        if reverse:
+            value, key = terms
+        else:
+            key, value = terms
+        if key in result_dict:
+            raise KeyError("key duplicated with [%s]" % (key))
+        if key_func:
+            key = key_func(key)
+        if value_func:
+            value = value_func(value)
+        result_dict[key] = value
+    return result_dict
+
+
+class Dataset(object):
+    """data reader"""
+
+    def __init__(self, args, mode="train"):
+        # read dicts
+        self.word2id_dict = load_kv_dict(
+            args.word_dict_path, reverse=True, value_func=int)
+        self.id2word_dict = load_kv_dict(args.word_dict_path)
+        self.label2id_dict = load_kv_dict(
+            args.label_dict_path, reverse=True, value_func=int)
+        self.id2label_dict = load_kv_dict(args.label_dict_path)
+        self.word_replace_dict = load_kv_dict(args.word_rep_dict_path)
+
+    @property
+    def vocab_size(self):
+        """vocabulary size"""
+        return max(self.word2id_dict.values()) + 1
+
+    @property
+    def num_labels(self):
+        """number of labels"""
+        return max(self.label2id_dict.values()) + 1
+
+    def get_num_examples(self, filename):
+        """number of lines of file"""
+        return sum(1 for line in io.open(filename, "r", encoding='utf8'))
+
+    def word_to_ids(self, words):
+        """convert words to word indexes"""
+        word_ids = []
+        for word in words:
+            word = self.word_replace_dict.get(word, word)
+            if word not in self.word2id_dict:
+                word = "OOV"
+            word_id = self.word2id_dict[word]
+            word_ids.append(word_id)
+
+        return word_ids
+
+    def label_to_ids(self, labels):
+        """convert labels to label indexes"""
+        label_ids = []
+        for label in labels:
+            if label not in self.label2id_dict:
+                label = "O"
+            label_id = self.label2id_dict[label]
+            label_ids.append(label_id)
+        return label_ids
+
+    def file_reader(self, filename, max_seq_len=64, mode="train"):
+        """
+        yield (word_idx, target_idx) one by one from file,
+        or yield (word_idx, ) in `infer` mode
+        """
+
+        def wrapper():
+            fread = io.open(filename, "r", encoding="utf-8")
+            if mode == "infer":
+                for line in fread:
+                    words = line.strip()
+                    word_ids = self.word_to_ids(words)
+                    yield (word_ids[0:max_seq_len], )
+            else:
+                headline = next(fread)
+                headline = headline.strip().split('\t')
+                assert len(headline) == 2 and headline[
+                    0] == "text_a" and headline[1] == "label"
+                for line in fread:
+                    words, labels = line.strip("\n").split("\t")
+                    if len(words) < 1:
+                        continue
+                    word_ids = self.word_to_ids(words.split("\002"))
+                    label_ids = self.label_to_ids(labels.split("\002"))
+                    assert len(word_ids) == len(label_ids)
+                    yield word_ids[0:max_seq_len], label_ids[0:max_seq_len]
+            fread.close()
+
+        return wrapper
+
+
+class LACProcessor(object):
+    def __init__(self, args, data_dir, vocab_path, random_seed=None):
+        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
+        self.args = args
+        self.dataset = Dataset(args)
+        self.data_dir = data_dir
+
+    def get_train_examples(self, data_dir):
+        return self.dataset.file_reader(
+            self.data_dir, self.args.max_seq_lens, mode="train")
+
+    def get_dev_examples(self, data_dir):
+        return self.dataset.file_reader(
+            self.data_dir, self.args.max_seq_lens, mode="dev")
+
+    def get_test_examples(self, data_dir):
+        return self.dataset.file_reader(
+            self.data_dir, self.args.max_seq_lens, mode="test")
+
+    def data_generator(self, mode='train', epoch=1, shuffle=True):
+        if mode == "train":
+            return paddle.batch(
+                self.get_train_examples(self.data_dir),
+                self.args.batch_size,
+                drop_last=True)
+        elif mode == "dev":
+            return paddle.batch(
+                self.get_dev_examples(self.data_dir),
+                self.args.batch_size,
+                drop_last=True)
+        elif mode == "infer":
+            return paddle.batch(
+                self.get_test_examples(self.data_dir),
+                self.args.batch_size,
+                drop_last=True)
+        else:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(__doc__)
+    parser.add_argument(
+        "--word_dict_path",
+        type=str,
+        default="./conf/word.dic",
+        help="word dict")
+    parser.add_argument(
+        "--label_dict_path",
+        type=str,
+        default="./conf/tag.dic",
+        help="label dict")
+    parser.add_argument(
+        "--word_rep_dict_path",
+        type=str,
+        default="./conf/q2b.dic",
+        help="word replace dict")
+    args = parser.parse_args()
+    dataset = Dataset(args)
+    processor = LACProcessor(args, "data/train.tsv", args.word_dict_path)
+    for data in processor.data_generator("train")():
+        for xx in data:
+            print(xx)
diff --git a/dygraph/lexical_analysis/utils.py b/dygraph/lexical_analysis/utils.py
new file mode 100644
index 00000000..fa3bfba0
--- /dev/null
+++ b/dygraph/lexical_analysis/utils.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+util tools
+"""
+from __future__ import print_function
+import os
+import sys
+import numpy as np
+import paddle.fluid as fluid
+import yaml
+import io
+
+
+def str2bool(v):
+    """
+    Convert a command-line string to bool; argparse's built-in bool()
+    treats any non-empty string as True.
+    """
+    return v.lower() in ("true", "t", "1")
+
+
+class ArgumentGroup(object):
+    """
+    Put arguments into one group
+    """
+
+    def __init__(self, parser, title, des):
+        """none"""
+        self._group = parser.add_argument_group(title=title, description=des)
+
+    def add_arg(self, name, type, default, help, **kwargs):
+        """Add argument"""
+        type = str2bool if type == bool else type
+        self._group.add_argument(
+            "--" + name,
+            default=default,
+            type=type,
+            help=help + ' Default: %(default)s.',
+            **kwargs)
+
+
+def load_yaml(parser, file_name, **kwargs):
+    """Register every field of a yaml config file as a command-line argument."""
+    with io.open(file_name, 'r', encoding='utf8') as f:
+        args = yaml.safe_load(f)
+        for title in args:
+            group = parser.add_argument_group(title=title, description='')
+            for name in args[title]:
+                _type = type(args[title][name]['val'])
+                _type = str2bool if _type == bool else _type
+                group.add_argument(
+                    "--" + name,
+                    default=args[title][name]['val'],
+                    type=_type,
+                    help=args[title][name]['meaning'] +
+                    ' Default: %(default)s.',
+                    **kwargs)
+
+
+def print_arguments(args):
+    """none"""
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
-- 
GitLab
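
A minimal usage sketch of how the files above are expected to fit together, assuming the ./conf dictionaries and ./data TSV files referenced in args.yaml are in place (the flag values below are illustrative):

    # utils.load_yaml registers every field of args.yaml as a command-line flag,
    # so main.py can be launched with the defaults overridden, e.g.
    python main.py --use_cuda true --epoch 1 --batch_size 32

    # reader.py can also be smoke-tested on its own; it prints batches of
    # (word_ids, label_ids) pairs read from data/train.tsv
    python reader.py --word_dict_path ./conf/word.dic --label_dict_path ./conf/tag.dic --word_rep_dict_path ./conf/q2b.dic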