Unverified commit 0aeae50c authored by pkpk, committed by GitHub

Merge pull request #21 from 0YuanZhang0/lac_model

add_lac_model
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
lexical analysis network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
from metrics import Metric
from model import Model, Input, Loss, set_device
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.initializer import NormalInitializer
from paddle.fluid.dygraph.nn import Embedding, Linear, GRUUnit


class DynamicGRU(fluid.dygraph.Layer):
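    """
    A single-direction GRU for dygraph mode: applies GRUUnit step by step
    along the time axis (optionally in reverse) and stacks the per-step
    hidden states into a [batch, seq_len, size] tensor.
    """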
def __init__(self,
size,
h_0=None,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
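

# A minimal usage sketch for DynamicGRU (illustrative shapes; assumes dygraph
# mode is enabled, and `h_0` is provided because forward() feeds it to GRUUnit):
#
#   h0 = fluid.dygraph.to_variable(np.zeros((8, 64), dtype='float32'))
#   gru = DynamicGRU(size=64, h_0=h0)
#   x = fluid.dygraph.to_variable(
#       np.random.rand(8, 20, 192).astype('float32'))  # [batch, seq, size * 3]
#   out = gru(x)  # [batch, seq, 64]

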
class BiGRU(fluid.dygraph.Layer):
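    """
    A bi-directional GRU block: a forward and a reverse Linear + DynamicGRU
    branch whose outputs are concatenated on the last axis, producing a
    [batch, seq_len, 2 * grnn_hidden_dim] representation.
    """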
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__()
self.pre_gru = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru = DynamicGRU(
size=grnn_hidden_dim,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.pre_gru_r = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_r = DynamicGRU(
size=grnn_hidden_dim,
is_reverse=True,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
def forward(self, input_feature):
res_pre_gru = self.pre_gru(input_feature)
res_gru = self.gru(res_pre_gru)
res_pre_gru_r = self.pre_gru_r(input_feature)
res_gru_r = self.gru_r(res_pre_gru_r)
bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
return bi_merge


class Linear_chain_crf(fluid.dygraph.Layer):
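    """
    Dygraph wrapper around the linear_chain_crf op. The learnable transition
    matrix has shape [size + 2, size]: the two extra rows hold the CRF's
    start and end transition weights.
    """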
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Linear_chain_crf, self).__init__()
self._param_attr = param_attr
self._dtype = dtype
self._size = size
self._is_test = is_test
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label, length=None):
alpha = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
emission_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
transition_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
log_likelihood = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": [label]
}
if length:
this_inputs['Length'] = [length]
self._helper.append_op(
type='linear_chain_crf',
inputs=this_inputs,
            outputs={
                "Alpha": [alpha],
                "EmissionExps": [emission_exps],
                "TransitionExps": [transition_exps],
                "LogLikelihood": [log_likelihood]
            },
attrs={"is_test": self._is_test, })
return log_likelihood


class Crf_decoding(fluid.dygraph.Layer):
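    """
    Dygraph wrapper around the crf_decoding op: runs Viterbi decoding with a
    transition matrix of the same [size + 2, size] layout as Linear_chain_crf,
    so trained CRF weights can be shared through the `weight` property.
    """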
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Crf_decoding, self).__init__()
self._dtype = dtype
self._size = size
self._is_test = is_test
self._param_attr = param_attr
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label=None, length=None):
viterbi_path = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": label
}
if length:
this_inputs['Length'] = [length]
self._helper.append_op(
type='crf_decoding',
inputs=this_inputs,
outputs={"ViterbiPath": [viterbi_path]},
attrs={"is_test": self._is_test, })
return viterbi_path


class Chunk_eval(fluid.dygraph.Layer):
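    """
    Dygraph wrapper around the chunk_eval op: counts inferred, labeled and
    correct chunks under a chunking scheme such as IOB, from which precision,
    recall and F1 are derived.
    """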
def __init__(self,
num_chunk_types,
chunk_scheme,
excluded_chunk_types=None):
super(Chunk_eval, self).__init__()
self.num_chunk_types = num_chunk_types
self.chunk_scheme = chunk_scheme
self.excluded_chunk_types = excluded_chunk_types
def forward(self, input, label, seq_length=None):
precision = self._helper.create_variable_for_type_inference(
dtype="float32")
recall = self._helper.create_variable_for_type_inference(
dtype="float32")
f1_score = self._helper.create_variable_for_type_inference(
dtype="float32")
num_infer_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_label_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_correct_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
this_input = {"Inference": input, "Label": label[0]}
if seq_length:
this_input["SeqLength"] = seq_length[0]
self._helper.append_op(
type='chunk_eval',
inputs=this_input,
outputs={
"Precision": [precision],
"Recall": [recall],
"F1-Score": [f1_score],
"NumInferChunks": [num_infer_chunks],
"NumLabelChunks": [num_label_chunks],
"NumCorrectChunks": [num_correct_chunks]
},
attrs={
"num_chunk_types": self.num_chunk_types,
"chunk_scheme": self.chunk_scheme,
"excluded_chunk_types": self.excluded_chunk_types or []
})
return (num_infer_chunks, num_label_chunks, num_correct_chunks)
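

# A worked example of what chunk_eval counts (illustrative): with IOB labels,
# gold = [B-PER, I-PER, O, B-LOC] contains 2 chunks; the prediction
# [B-PER, I-PER, O, O] contains 1 chunk, of which 1 is correct, so
# precision = 1/1, recall = 1/2 and F1 = 2 * 1.0 * 0.5 / 1.5 = 2/3.

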
class LAC(Model):
    def __init__(self, args, vocab_size, num_labels, length=None):
        """
        Define the lexical analysis network structure: a word embedding
        layer, a stack of BiGRU layers, a linear emission layer, and a
        linear-chain CRF whose transition weights are shared between the
        training loss and Viterbi decoding.
        """
        super(LAC, self).__init__()
self.word_emb_dim = args.word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
            args) else 1.0
        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
            args) else 1.0
self.bigru_num = args.bigru_num
self.init_bound = 0.1
self.word_embedding = Embedding(
size=[self.vocab_size, self.word_emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
h_0 = fluid.layers.create_global_var(
shape=[args.batch_size, self.grnn_hidden_dim],
value=0.0,
dtype='float32',
persistable=True,
force_cpu=True,
name='h_0')
self.bigru_units = []
for i in range(self.bigru_num):
if i == 0:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
else:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim * 2,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
self.fc = Linear(
input_dim=self.grnn_hidden_dim * 2,
output_dim=self.num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.linear_chain_crf = Linear_chain_crf(
param_attr=fluid.ParamAttr(
name='linear_chain_crfw', learning_rate=self.crf_lr),
size=self.num_labels)
self.crf_decoding = Crf_decoding(
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
size=self.num_labels)
def forward(self, word, target, lengths):
"""
Configure the network
"""
word_embed = self.word_embedding(word)
input_feature = word_embed
for i in range(self.bigru_num):
bigru_output = self.bigru_units[i](input_feature)
input_feature = bigru_output
emission = self.fc(bigru_output)
crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost)
self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths
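

# Shape flow through LAC.forward (a sketch; B = batch size, T = max_seq_len,
# H = grnn_hidden_dim, L = num_labels):
#   word [B, T] -> embedding [B, T, word_emb_dim]
#   -> stacked BiGRUs [B, T, 2 * H] -> fc emission [B, T, L]
#   -> linear-chain CRF loss (scalar) and Viterbi-decoded tag path

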
class LacLoss(Loss):
def __init__(self):
super(LacLoss, self).__init__()
pass
def forward(self, outputs, labels):
avg_cost = outputs[1]
return avg_cost


class ChunkEval(Metric):
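    """
    Chunk-level precision/recall/F1 metric built on Chunk_eval: accumulates
    chunk counts across batches and reports corpus-level scores in
    accumulate().
    """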
def __init__(self, num_labels, name=None, *args, **kwargs):
super(ChunkEval, self).__init__(*args, **kwargs)
self._init_name(name)
self.chunk_eval = Chunk_eval(
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, pred, label, *args, **kwargs):
crf_decode = pred[0]
lengths = pred[2]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
return [num_infer_chunks, num_label_chunks, num_correct_chunks]
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
*args, **kwargs):
self.infer_chunks_total += num_infer_chunks
self.label_chunks_total += num_label_chunks
self.correct_chunks_total += num_correct_chunks
precision = float(
num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
recall = float(
num_correct_chunks) / num_label_chunks if num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if num_correct_chunks else 0
return [precision, recall, f1_score]
def reset(self):
self.infer_chunks_total = 0
self.label_chunks_total = 0
self.correct_chunks_total = 0
def accumulate(self):
precision = float(
self.correct_chunks_total
) / self.infer_chunks_total if self.infer_chunks_total else 0
recall = float(
self.correct_chunks_total
) / self.label_chunks_total if self.label_chunks_total else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.correct_chunks_total else 0
res = [precision, recall, f1_score]
return res
    def _init_name(self, name):
        # `name` is accepted for interface compatibility; this metric always
        # reports the three fixed sub-metric names below.
        self._name = ['precision', 'recall', 'F1']
def name(self):
return self._name


class LacDataset(object):
"""
Load lexical analysis dataset
"""
def __init__(self, args):
self.word_dict_path = args.word_dict_path
self.label_dict_path = args.label_dict_path
self.word_rep_dict_path = args.word_rep_dict_path
self._load_dict()
def _load_dict(self):
self.word2id_dict = self.load_kv_dict(
self.word_dict_path, reverse=True, value_func=np.int64)
self.id2word_dict = self.load_kv_dict(self.word_dict_path)
self.label2id_dict = self.load_kv_dict(
self.label_dict_path, reverse=True, value_func=np.int64)
self.id2label_dict = self.load_kv_dict(self.label_dict_path)
if self.word_rep_dict_path is None:
self.word_replace_dict = dict()
else:
self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)
def load_kv_dict(self,
dict_path,
reverse=False,
delimiter="\t",
key_func=None,
value_func=None):
"""
Load key-value dict from file
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
if reverse:
value, key = terms
else:
key, value = terms
if key in result_dict:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
result_dict[key] = value
return result_dict
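
    # Dict file format: one "<key>\t<value>" pair per line. With reverse=True
    # the two columns are swapped on load, so a word-dict line "0\t的" yields
    # word2id_dict["的"] = np.int64(0). (The sample line is illustrative, not
    # taken from a real dict file.)
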
    @property
    def vocab_size(self):
        return len(self.word2id_dict)

    @property
    def num_labels(self):
        return len(self.label2id_dict)
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in io.open(filename, "r", encoding='utf8'))
def word_to_ids(self, words):
"""convert word to word index"""
word_ids = []
for word in words:
word = self.word_replace_dict.get(word, word)
if word not in self.word2id_dict:
word = "OOV"
word_id = self.word2id_dict[word]
word_ids.append(word_id)
return word_ids
def label_to_ids(self, labels):
"""convert label to label index"""
label_ids = []
for label in labels:
if label not in self.label2id_dict:
label = "O"
label_id = self.label2id_dict[label]
label_ids.append(label_id)
return label_ids
def file_reader(self,
filename,
mode="train",
batch_size=32,
max_seq_len=126):
"""
yield (word_idx, target_idx) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def wrapper():
fread = io.open(filename, "r", encoding="utf-8")
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
buf = []
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
word_ids = word_ids[0:max_seq_len]
words_len = np.int64(len(word_ids))
word_ids += [0 for _ in range(max_seq_len - words_len)]
label_ids = label_ids[0:max_seq_len]
label_ids += [0 for _ in range(max_seq_len - words_len)]
assert len(word_ids) == len(label_ids)
yield word_ids, label_ids, words_len
fread.close()
return wrapper
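

# Expected data file layout, inferred from the parser above (the sample
# tokens are illustrative only): a "text_a\tlabel" header line, then one
# example per line with "\002"-separated tokens and labels, e.g.
#   text_a<TAB>label
#   word1\002word2\002word3<TAB>tag1\002tag2\002tag3

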
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def wrapper():
batch_words, batch_labels, seq_lens = [], [], []
        for epoch in range(args.epoch):
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
batch_labels.append(labels)
seq_lens.append(words_len)
if len(seq_lens) == args.batch_size:
yield batch_words, batch_labels, seq_lens, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0:
yield batch_words, batch_labels, seq_lens, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
return wrapper


def create_dataloader(generator, place, feed_list=None):
if not feed_list:
data_loader = fluid.io.DataLoader.from_generator(
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
else:
data_loader = fluid.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
data_loader.set_batch_generator(generator, places=place)
return data_loader


def main(args):
place = set_device(args.device)
    if args.dynamic:
        fluid.enable_dygraph(place)
    inputs = [
        Input([None, args.max_seq_len], 'int64', name='words'),
        Input([None, args.max_seq_len], 'int64', name='target'),
        Input([None], 'int64', name='length')
    ]
labels = [Input([None, args.max_seq_len], 'int64', name='labels')]
feed = [x.forward() for x in inputs + labels]
dataset = LacDataset(args)
train_path = os.path.join(args.data, "train.tsv")
test_path = os.path.join(args.data, "test.tsv")
if args.dynamic:
feed_list = None
else:
feed_list = feed
train_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=train_path, place=place, mode="train")
test_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=test_path, place=place, mode="test")
train_dataset = create_dataloader(
train_generator, place, feed_list=feed_list)
test_dataset = create_dataloader(
test_generator, place, feed_list=feed_list)
vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = LAC(args, vocab_size, num_labels)
optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())
model.prepare(
optim,
LacLoss(),
ChunkEval(num_labels),
inputs=inputs,
labels=labels,
device=args.device)
if args.resume is not None:
model.load(args.resume)
model.fit(train_dataset,
test_dataset,
epochs=args.epoch,
batch_size=args.batch_size,
eval_freq=args.eval_freq,
save_freq=args.save_freq,
save_dir=args.save_dir)


if __name__ == '__main__':
parser = argparse.ArgumentParser("LAC training")
parser.add_argument(
"-dir", "--data", default=None, type=str, help='path to LAC dataset')
parser.add_argument(
"-wd",
"--word_dict_path",
default=None,
type=str,
help='word dict path')
parser.add_argument(
"-ld",
"--label_dict_path",
default=None,
type=str,
help='label dict path')
parser.add_argument(
"-wrd",
"--word_rep_dict_path",
default=None,
type=str,
help='The path of the word replacement Dictionary.')
parser.add_argument(
"-dev",
"--device",
type=str,
default='gpu',
help="device to use, gpu or cpu")
parser.add_argument(
"-d", "--dynamic", action='store_true', help="enable dygraph mode")
    parser.add_argument(
        "-e", "--epoch", default=10, type=int, help="number of epochs")
parser.add_argument(
'-lr',
'--base_learning_rate',
default=1e-3,
type=float,
metavar='LR',
help='initial learning rate')
parser.add_argument(
"--word_emb_dim",
default=128,
type=int,
help='word embedding dimension')
parser.add_argument(
"--grnn_hidden_dim", default=128, type=int, help="hidden dimension")
    parser.add_argument(
        "--bigru_num",
        default=2,
        type=int,
        help='number of stacked BiGRU layers')
parser.add_argument("-elr", "--emb_learning_rate", default=1.0, type=float)
parser.add_argument("-clr", "--crf_learning_rate", default=1.0, type=float)
parser.add_argument(
"-b", "--batch_size", default=300, type=int, help="batch size")
parser.add_argument(
"--max_seq_len", default=126, type=int, help="max sequence length")
parser.add_argument(
"-n", "--num_devices", default=1, type=int, help="number of devices")
parser.add_argument(
"-r",
"--resume",
default=None,
type=str,
help="checkpoint path to resume")
parser.add_argument(
"-o",
"--save_dir",
default="./model",
type=str,
help="save model path")
parser.add_argument(
"-sf", "--save_freq", default=1, type=int, help="save frequency")
parser.add_argument(
"-ef", "--eval_freq", default=1, type=int, help="eval frequency")
args = parser.parse_args()
print(args)
main(args)