Commit 3eda4ade authored by xujinanne

add lac

Parent 9ce0dcb3
model:
word_emb_dim:
val: 128
meaning: "The dimension in which a word is embedded."
grnn_hidden_dim:
val: 128
meaning: "The number of hidden nodes in the GRNN layer."
bigru_num:
val: 2
meaning: "The number of bi_gru layers in the network."
init_checkpoint:
val: ""
meaning: "Path to init model"
inference_save_dir:
val: ""
meaning: "Path to save inference model"
train:
random_seed:
val: 0
meaning: "Random seed for training"
print_steps:
val: 1
meaning: "Print training metrics every print_steps batches."
save_steps:
val: 10
meaning: "Save the model every save_steps batches."
validation_steps:
val: 10
meaning: "Run validation every validation_steps batches."
batch_size:
val: 100
meaning: "The number of sequences contained in a mini-batch"
epoch:
val: 10
meaning: "Corpus iteration num"
use_cuda:
val: True
meaning: "If set, use GPU for training."
traindata_shuffle_buffer:
val: 20000
meaning: "The buffer size used in shuffle the training data."
base_learning_rate:
val: 0.001
meaning: "The basic learning rate that affects the entire network."
emb_learning_rate:
val: 2
meaning: "The real learning rate of the embedding layer will be (emb_learning_rate * base_learning_rate)."
crf_learning_rate:
val: 0.2
meaning: "The real learning rate of the CRF layer will be (crf_learning_rate * base_learning_rate)."
enable_ce:
val: false
meaning: 'If set, run the task with continuous evaluation logs.'
cpu_num:
val: 10
meaning: "The number of CPU cores used to train the model; this argument is ignored when use_cuda=True."
data:
word_dict_path:
val: "./conf/word.dic"
meaning: "The path of the word dictionary."
label_dict_path:
val: "./conf/tag.dic"
meaning: "The path of the label dictionary."
word_rep_dict_path:
val: "./conf/q2b.dic"
meaning: "The path of the word replacement dictionary."
train_data:
val: "./data/train.tsv"
meaning: "The path of the training data file."
test_data:
val: "./data/test.tsv"
meaning: "The path of the test data file."
infer_data:
val: "./data/infer.tsv"
meaning: "The path of the inference data file."
model_save_dir:
val: "./models"
meaning: "The model will be saved in this path."
max_seq_lens:
val: 65
meaning: "The maximum sentence length of the data."
model:
word_emb_dim:
val: 128
meaning: "The dimension in which a word is embedded."
grnn_hidden_dim:
val: 128
meaning: "The number of hidden nodes in the GRNN layer."
bigru_num:
val: 2
meaning: "The number of bi_gru layers in the network."
init_checkpoint:
val: ""
meaning: "Path to init model"
inference_save_dir:
val: ""
meaning: "Path to save inference model"
train:
random_seed:
val: 0
meaning: "Random seed for training"
print_steps:
val: 1
meaning: "Print training metrics every print_steps batches."
save_steps:
val: 10
meaning: "Save the model every save_steps batches."
validation_steps:
val: 10
meaning: "Run validation every validation_steps batches."
batch_size:
val: 300
meaning: "The number of sequences contained in a mini-batch"
epoch:
val: 10
meaning: "Corpus iteration num"
use_cuda:
val: False
meaning: "If set, use GPU for training."
traindata_shuffle_buffer:
val: 20000
meaning: "The buffer size used in shuffle the training data."
base_learning_rate:
val: 0.001
meaning: "The basic learning rate that affects the entire network."
emb_learning_rate:
val: 2
meaning: "The real learning rate of the embedding layer will be (emb_learning_rate * base_learning_rate)."
crf_learning_rate:
val: 0.2
meaning: "The real learning rate of the CRF layer will be (crf_learning_rate * base_learning_rate)."
enable_ce:
val: false
meaning: 'If set, run the task with continuous evaluation logs.'
cpu_num:
val: 10
meaning: "The number of CPU cores used to train the model; this argument is ignored when use_cuda=True."
data:
word_dict_path:
val: "./conf/word.dic"
meaning: "The path of the word dictionary."
label_dict_path:
val: "./conf/tag.dic"
meaning: "The path of the label dictionary."
word_rep_dict_path:
val: "./conf/q2b.dic"
meaning: "The path of the word replacement dictionary."
train_data:
val: "./data/train.tsv"
meaning: "The path of the training data file."
test_data:
val: "./data/test.tsv"
meaning: "The path of the test data file."
infer_data:
val: "./data/infer.tsv"
meaning: "The path of the inference data file."
model_save_dir:
val: "./models"
meaning: "The model will be saved in this path."
model:
ernie_config_path:
val: "../LARK/ERNIE/config/ernie_config.json"
meaning: "Path to the json file for ernie model config."
init_checkpoint:
val: ""
meaning: "Path to init model"
mode:
val: "train"
meaning: "Set to train, eval, or infer."
init_pretraining_params:
val: "pretrained/params/"
meaning: "Path to the pre-trained parameters to fine-tune from. If 'init_checkpoint' is set, this argument is ignored."
train:
random_seed:
val: 0
meaning: "Random seed for training"
batch_size:
val: 10
meaning: "The number of sequences contained in a mini-batch"
epoch:
val: 10
meaning: "Corpus iteration num"
use_cuda:
val: True
meaning: "If set, use GPU for training."
base_learning_rate:
val: 0.0002
meaning: "The basic learning rate that affects the entire network."
init_bound:
val: 0.1
meaning: "The bound of the uniform distribution used for parameter initialization."
crf_learning_rate:
val: 0.2
meaning: "The real learning rate of the CRF layer will be (crf_learning_rate * base_learning_rate)."
cpu_num:
val: 10
meaning: "The number of CPU cores used to train the model; only effective when use_cuda=False."
print_steps:
val: 1
meaning: "Print training metrics every print_steps batches."
save_steps:
val: 10
meaning: "Save the model every save_steps batches."
validation_steps:
val: 5
meaning: "Run validation every validation_steps batches."
data:
vocab_path:
val: "../LARK/ERNIE/config/vocab.txt"
meaning: "The path of the vocabulary."
label_map_config:
val: "./conf/label_map.json"
meaning: "The path of the label dictionary."
num_labels:
val: 57
meaning: "The number of labels."
max_seq_len:
val: 128
meaning: "The maximum number of tokens in a sequence."
do_lower_case:
val: True
meaning: "Whether to lower case the input text. Should be True for uncased models and False for cased models."
train_data:
val: "./data/train.tsv"
meaning: "The path of the training data file."
test_data:
val: "./data/test.tsv"
meaning: "The path of the test data file."
infer_data:
val: "./data/test.tsv"
meaning: "The path of the inference data file."
model_save_dir:
val: "./ernie_models"
meaning: "The model will be saved in this path."
{"d-B": 8, "c-I": 7, "PER-I": 49, "nr-B": 16, "u-B": 36, "c-B": 6, "nr-I": 17, "an-I": 5, "ns-B": 18, "vn-I": 43, "w-B": 44, "an-B": 4, "PER-B": 48, "vn-B": 42, "ns-I": 19, "a-I": 1, "r-B": 30, "xc-B": 46, "LOC-B": 50, "ad-I": 3, "nz-B": 24, "u-I": 37, "a-B": 0, "ad-B": 2, "vd-I": 41, "nw-B": 22, "m-I": 13, "d-I": 9, "n-B": 14, "nz-I": 25, "vd-B": 40, "nw-I": 23, "n-I": 15, "nt-B": 20, "ORG-I": 53, "nt-I": 21, "ORG-B": 52, "LOC-I": 51, "t-B": 34, "TIME-I": 55, "O": 56, "s-I": 33, "f-I": 11, "TIME-B": 54, "t-I": 35, "f-B": 10, "s-B": 32, "r-I": 31, "q-B": 28, "v-I": 39, "v-B": 38, "w-I": 45, "q-I": 29, "p-B": 26, "xc-I": 47, "m-B": 12, "p-I": 27}
 
、 ,
。 .
— -
~ ~
‖ |
… .
‘ '
’ '
“ "
” "
〔 (
〕 )
〈 <
〉 >
「 '
」 '
『 "
』 "
〖 [
〗 ]
【 [
】 ]
∶ :
$ $
! !
" "
# #
% %
& &
' '
( (
) )
* *
+ +
, ,
- -
. .
/ /
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
: :
; ;
< <
= =
> >
? ?
@ @
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
[ [
\ \
] ]
^ ^
_ _
` `
a a
b b
c c
d d
e e
f f
g g
h h
i i
j j
k k
l l
m m
n n
o o
p p
q q
r r
s s
t t
u u
v v
w w
x x
y y
z z
{ {
| |
} }
 ̄ ~
〝 "
〞 "
﹐ ,
﹑ ,
﹒ .
﹔ ;
﹕ :
﹖ ?
﹗ !
﹙ (
﹚ )
﹛ {
﹜ }
﹝ [
﹞ ]
﹟ #
﹠ &
﹡ *
﹢ +
﹣ -
﹤ <
﹥ >
﹦ =
﹨ \
﹩ $
﹪ %
﹫ @
,
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import nets
import reader
import utils
def train(args, place):
with fluid.dygraph.guard(place):
dataset = reader.Dataset(args)
num_train_examples = dataset.get_num_examples(args.train_data)
max_train_steps = args.epoch * num_train_examples // args.batch_size
# define reader
train_processor = reader.LACProcessor(args, args.train_data,
args.word_dict_path)
test_processor = dataset.file_reader(args.test_data, mode="test")
# define network
model = nets.LAC("lac_net", args, dataset.vocab_size,
dataset.num_labels)
sgd_optimizer = fluid.optimizer.Adagrad(
learning_rate=args.base_learning_rate)
steps = 0
total_cost, total_acc, total_num_seqs = [], [], []
for eop in range(args.epoch):
time_begin = time.time()
for data in train_processor.data_generator("train")():
steps += 1
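# pad (or truncate) each word-id sequence to max_seq_lens, using the vocab-size id as the padding value so padded positions can be masked out later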
doc = to_variable(
np.array([
np.pad(x[0][0:args.max_seq_lens], (
0, args.max_seq_lens - len(x[0][
0:args.max_seq_lens])),
'constant',
constant_values=(dataset.vocab_size))
for x in data
]).astype('int64').reshape(-1, 1))
seq_lens = to_variable(
np.array([len(x[0]) for x in data]).astype('int64'))
targets = to_variable(
np.array([
np.pad(x[1][0:args.max_seq_lens], (
0, args.max_seq_lens - len(x[1][
0:args.max_seq_lens])),
'constant',
constant_values=(dataset.num_labels))
for x in data
]).astype('int64'))
model.train()
avg_cost, prediction, acc = model(doc, targets, seq_lens)
avg_cost.backward()
np_mask = (doc.numpy() != dataset.vocab_size).astype('int32')
word_num = np.sum(np_mask)
sgd_optimizer.minimize(avg_cost)
model.clear_gradients()
total_cost.append(avg_cost.numpy() * word_num)
total_acc.append(acc.numpy() * word_num)
total_num_seqs.append(word_num)
if steps % args.print_steps == 0:
time_end = time.time()
used_time = time_end - time_begin
print("step: %d, ave loss: %f, "
"ave acc: %f, speed: %f steps/s" %
(steps, np.sum(total_cost) / np.sum(total_num_seqs),
np.sum(total_acc) / np.sum(total_num_seqs),
args.print_steps / used_time))
total_cost, total_acc, total_num_seqs = [], [], []
time_begin = time.time()
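# periodic evaluation pass: switch the model to eval mode and report token-weighted average loss and accuracy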
if steps % args.validation_steps == 0:
total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
model.eval()
eval_steps = 0
for data in train_processor.data_generator("train")():
steps += 1
eval_doc = to_variable(
np.array([
np.pad(x[0][0:args.max_seq_lens], (
0, args.max_seq_lens - len(x[0][
0:args.max_seq_lens])),
'constant',
constant_values=(dataset.vocab_size))
for x in data
]).astype('int64').reshape(-1, 1))
eval_seq_lens = to_variable(
np.array([len(x[0]) for x in data]).astype('int64')
.reshape(args.batch_size, 1))
eval_targets = to_variable(
np.array([
np.pad(x[1][0:args.max_seq_lens], (
0, args.max_seq_lens - len(x[1][
0:args.max_seq_lens])),
'constant',
constant_values=(dataset.num_labels))
for x in data
]).astype('int64'))
eval_avg_cost, eval_prediction, eval_acc = model(
eval_doc, eval_targets, eval_seq_lens)
eval_np_mask = (
eval_doc.numpy() != dataset.vocab_size).astype('int32')
eval_word_num = np.sum(eval_np_mask)
total_eval_cost.append(eval_avg_cost.numpy() *
eval_word_num)
total_eval_acc.append(eval_acc.numpy() * eval_word_num)
total_eval_num_seqs.append(eval_word_num)
eval_steps += 1
time_end = time.time()
used_time = time_end - time_begin
print("Final validation result: step: %d, ave loss: %f, "
"ave acc: %f, speed: %f steps/s" %
(steps, np.sum(total_eval_cost) /
np.sum(total_eval_num_seqs), np.sum(total_eval_acc) /
np.sum(total_eval_num_seqs), eval_steps / used_time))
time_begin = time.time()
if args.enable_ce:
print("kpis\ttrain_loss\t%0.3f" %
(np.sum(total_eval_cost) /
np.sum(total_eval_num_seqs)))
print("kpis\ttrain_acc\t%0.3f" %
(np.sum(total_eval_acc) /
np.sum(total_eval_num_seqs)))
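# checkpoint the dygraph parameters every save_steps batches via save_dygraph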
if steps % args.save_steps == 0:
save_path = "save_dir_" + str(steps)
print('save model to: ' + save_path)
fluid.dygraph.save_dygraph(model.state_dict(),
save_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
utils.load_yaml(parser, 'args.yaml')
args = parser.parse_args()
if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = 1
print(args)
train(args, place)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, Embedding
from paddle.fluid.dygraph import GRUUnit
from paddle.fluid.dygraph.base import to_variable
import numpy as np
class DynamicGRU(fluid.dygraph.Layer):
def __init__(self,
scope_name,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__(scope_name)
self.gru_unit = GRUUnit(
self.full_name(),
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
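# unroll the GRU over the time axis one step at a time; iterate (and collect outputs) in reverse order when is_reverse is set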
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
class LAC(fluid.dygraph.Layer):
def __init__(self,
name_scope,
args,
vocab_size,
num_labels,
for_infer=True,
target=None):
super(LAC, self).__init__(name_scope)
self.word_emb_dim = args.word_emb_dim
self.dict_dim = vocab_size
self.grnn_hidden_dim = args.grnn_hidden_dim
self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
args) else 1.0
self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
args) else 1.0
self.bigru_num = args.bigru_num
self.init_bound = 0.1
self.IS_SPARSE = True
self.max_seq_lens = args.max_seq_lens
self.grnn_hidden_dim = args.grnn_hidden_dim
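# sparse word embedding; its learning rate is scaled by emb_learning_rate relative to the base learning rate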
self._word_embedding = Embedding(
self.full_name(),
size=[vocab_size, self.word_emb_dim],
dtype='float32',
is_sparse=self.IS_SPARSE,
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
self._emission_fc = FC(
self.full_name(),
size=num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
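# one bidirectional GRU block: project the input to 3 * hidden size, run a forward and a reverse DynamicGRU, and concatenate their outputs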
def _bigru_layer(self, input_feature, grnn_hidden_dim):
"""
define the bidirectional gru layer
"""
pre_gru = FC(input=input_feature,
size=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
gru = DynamicGRU(
input=pre_gru,
size=grnn_hidden_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
pre_gru_r = FC(input=input_feature,
size=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
gru_r = DynamicGRU(
input=pre_gru_r,
size=grnn_hidden_dim,
is_reverse=True,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
return bi_merge
def forward(self, inputs, targets, seq_lens):
emb = self._word_embedding(inputs)
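# zero out embeddings at padded positions (word id == vocab size)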
o_np_mask = (inputs.numpy() != self.dict_dim).astype('float32')
mask_emb = fluid.layers.expand(
to_variable(o_np_mask), [1, self.word_emb_dim])
emb = emb * mask_emb
emb = fluid.layers.reshape(
emb, shape=[-1, 1, self.max_seq_lens, self.word_emb_dim])
input_feature = emb
for i in range(self.bigru_num):
bigru_output = self._bigru_layer(input_feature, self.grnn_hidden_dim)
input_feature = bigru_output
emission = self._emission_fc(input_feature)
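# with labels: compute the linear-chain CRF loss and Viterbi decoding; without labels: decode only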
if targets is not None:
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=targets,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
length=seq_lens)
avg_cost = fluid.layers.mean(x=crf_cost)
crf_decode = fluid.layers.crf_decoding(
input=emission,
param_attr=fluid.ParamAttr(name='crfw'),
length=seq_lens)
return avg_cost, crf_decode
else:
size = emission.shape[1]
fluid.layers.create_parameter(
shape=[size + 2, size], dtype=emission.dtype, name='crfw')
crf_decode = fluid.layers.crf_decoding(
input=emission,
param_attr=fluid.ParamAttr(name='crfw'),
length=seq_lens)
return crf_decode
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file_reader converts raw corpus to input.
"""
from __future__ import print_function
import os
import argparse
import io
import glob
import paddle
def load_kv_dict(dict_path,
reverse=False,
delimiter="\t",
key_func=None,
value_func=None):
"""
Load key-value dict from file
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
if reverse:
value, key = terms
else:
key, value = terms
if key in result_dict:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
result_dict[key] = value
return result_dict
class Dataset(object):
"""data reader"""
def __init__(self, args, mode="train"):
# read dict
self.word2id_dict = load_kv_dict(
args.word_dict_path, reverse=True, value_func=int)
self.id2word_dict = load_kv_dict(args.word_dict_path)
self.label2id_dict = load_kv_dict(
args.label_dict_path, reverse=True, value_func=int)
self.id2label_dict = load_kv_dict(args.label_dict_path)
self.word_replace_dict = load_kv_dict(args.word_rep_dict_path)
@property
def vocab_size(self):
"""vocabuary size"""
return max(self.word2id_dict.values()) + 1
@property
def num_labels(self):
"""num_labels"""
return max(self.label2id_dict.values()) + 1
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in io.open(filename, "r", encoding='utf8'))
def word_to_ids(self, words):
"""convert word to word index"""
word_ids = []
for word in words:
word = self.word_replace_dict.get(word, word)
if word not in self.word2id_dict:
word = "OOV"
word_id = self.word2id_dict[word]
word_ids.append(word_id)
return word_ids
def label_to_ids(self, labels):
"""convert label to label index"""
label_ids = []
for label in labels:
if label not in self.label2id_dict:
label = "O"
label_id = self.label2id_dict[label]
label_ids.append(label_id)
return label_ids
def file_reader(self, filename, max_seq_len=64, mode="train"):
"""
yield (word_idx, target_idx) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def wrapper():
fread = io.open(filename, "r", encoding="utf-8")
if mode == "infer":
for line in fread:
words = line.strip()
word_ids = self.word_to_ids(words)
yield (word_ids[0:max_seq_len], )
else:
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[
0] == "text_a" and headline[1] == "label"
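# non-infer files are TSVs with a text_a/label header; tokens and labels within a field are joined by the '\002' separator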
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
yield word_ids[0:max_seq_len], label_ids[0:max_seq_len]
fread.close()
return wrapper
class LACProcessor(object):
def __init__(self, args, data_dir, vocab_path, random_seed=None):
self.num_examples = {"train": -1, "dev": -1, "infer": -1}
self.args = args
self.dataset = Dataset(args)
self.data_dir = data_dir
def get_train_examples(self, data_dir):
return self.dataset.file_reader(self.data_dir, 65, mode="train")
def get_dev_examples(self, data_dir):
return self.dataset.file_reader(self.data_dir, 65, mode="dev")
def get_test_examples(self, data_dir):
return self.dataset.file_reader(self.data_dir, 65, mode="test")
def data_generator(self, mode='train', epoch=1, shuffle=True):
if mode == "train":
return paddle.batch(
self.get_train_examples(self.data_dir), 300, drop_last=True)
elif mode == "dev":
return paddle.batch(
self.get_dev_examples(self.data_dir), 300, drop_last=True)
elif mode == "infer":
return paddle.batch(
self.get_test_examples(self.data_dir), 300, drop_last=True)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'infer'].")
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
"--word_dict_path",
type=str,
default="./conf/word.dic",
help="word dict")
parser.add_argument(
"--label_dict_path",
type=str,
default="./conf/tag.dic",
help="label dict")
parser.add_argument(
"--word_rep_dict_path",
type=str,
default="./conf/q2b.dic",
help="word replace dict")
args = parser.parse_args()
dataset = Dataset(args)
processor = LACProcessor(args, "data/train.tsv", args.word_dict_path)
for data in processor.data_generator("train")():
for xx in data:
print(xx)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
util tools
"""
from __future__ import print_function
import os
import sys
import numpy as np
import paddle.fluid as fluid
import yaml
import io
def str2bool(v):
"""
Convert a string flag to a bool, since argparse does not parse booleans from strings natively.
"""
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
"""
Put arguments to one group
"""
def __init__(self, parser, title, des):
"""none"""
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
""" Add argument """
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def load_yaml(parser, file_name, **kwargs):
with io.open(file_name, 'r', encoding='utf8') as f:
args = yaml.safe_load(f)
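# each top-level YAML section becomes an argparse group; every entry adds a --<name> flag with 'val' as the default and 'meaning' as the help text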
for title in args:
group = parser.add_argument_group(title=title, description='')
for name in args[title]:
_type = type(args[title][name]['val'])
_type = str2bool if _type == bool else _type
group.add_argument(
"--" + name,
default=args[title][name]['val'],
type=_type,
help=args[title][name]['meaning'] +
' Default: %(default)s.',
**kwargs)
def print_arguments(args):
"""none"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')