Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
hapi
提交
acd23c75
H
hapi
项目概览
PaddlePaddle
/
hapi
通知
11
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
4
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
H
hapi
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
4
Issue
4
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
acd23c75
编写于
4月 09, 2020
作者:
P
pkpk
提交者:
GitHub
4月 09, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #28 from 0YuanZhang0/sequence_tagging
sequence_tagging
上级
e93f9d5d
90db7136
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
808 addition
and
3 deletion
+808
-3
sequence_tagging/reader.py
sequence_tagging/reader.py
+186
-0
sequence_tagging/sequence_tagging.py
sequence_tagging/sequence_tagging.py
+323
-0
text.py
text.py
+299
-3
未找到文件。
sequence_tagging/reader.py
0 → 100644
浏览文件 @
acd23c75
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging dataset
"""
from
__future__
import
division
from
__future__
import
print_function
import
io
import
numpy
as
np
import
paddle.fluid
as
fluid
class LacDataset(object):
    """Load the dictionaries and data files of a lexical-analysis dataset.

    The constructor reads ``word_dict_path``, ``label_dict_path`` and
    ``word_rep_dict_path`` from ``args`` and immediately loads the lookup
    tables built from those files.
    """

    def __init__(self, args):
        self.word_dict_path = args.word_dict_path
        self.label_dict_path = args.label_dict_path
        self.word_rep_dict_path = args.word_rep_dict_path
        self._load_dict()

    def _load_dict(self):
        """Populate the word/label lookup tables and the replacement table."""
        # Dictionary files are stored as "<id>\t<token>"; reverse=True makes
        # the token the key and converts the id to np.int64.
        self.word2id_dict = self.load_kv_dict(
            self.word_dict_path, reverse=True, value_func=np.int64)
        self.id2word_dict = self.load_kv_dict(self.word_dict_path)
        self.label2id_dict = self.load_kv_dict(
            self.label_dict_path, reverse=True, value_func=np.int64)
        self.id2label_dict = self.load_kv_dict(self.label_dict_path)
        if self.word_rep_dict_path is None:
            self.word_replace_dict = dict()
        else:
            self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)

    def load_kv_dict(self,
                     dict_path,
                     reverse=False,
                     delimiter="\t",
                     key_func=None,
                     value_func=None):
        """Load a two-column text file into a dict.

        Args:
            dict_path: path of a UTF-8 text file, one entry per line.
            reverse: when true the second column is the key and the first
                column the value; otherwise the first column is the key.
            delimiter: column separator.
            key_func: optional callable applied to every key.
            value_func: optional callable applied to every value.

        Returns:
            dict mapping (converted) keys to (converted) values. Lines that
            do not split into exactly two columns are ignored.

        Raises:
            KeyError: if the same key occurs twice.
        """
        result_dict = {}
        # The context manager closes the file even if a duplicate key or a
        # conversion error aborts the loop.
        with io.open(dict_path, "r", encoding='utf8') as fin:
            for line in fin:
                terms = line.strip("\n").split(delimiter)
                if len(terms) != 2:
                    continue
                if reverse:
                    value, key = terms
                else:
                    key, value = terms
                # Convert the key before the duplicate check so that keys
                # which collide only after conversion are detected too.
                if key_func:
                    key = key_func(key)
                if key in result_dict:
                    raise KeyError("key duplicated with [%s]" % (key))
                if value_func:
                    value = value_func(value)
                result_dict[key] = value
        return result_dict

    @property
    def vocab_size(self):
        """Number of entries in the word-to-id table."""
        return len(self.word2id_dict)

    @property
    def num_labels(self):
        """Number of entries in the label-to-id table."""
        return len(self.label2id_dict)

    def get_num_examples(self, filename):
        """Return the number of lines in ``filename`` (header included)."""
        with io.open(filename, "r", encoding='utf8') as fin:
            return sum(1 for _ in fin)

    def word_to_ids(self, words):
        """Convert words to ids.

        Each word is first passed through the replacement table; words that
        are still unknown are looked up as ``"OOV"``.
        """
        word_ids = []
        for word in words:
            word = self.word_replace_dict.get(word, word)
            if word not in self.word2id_dict:
                word = "OOV"
            word_ids.append(self.word2id_dict[word])
        return word_ids

    def label_to_ids(self, labels):
        """Convert labels to ids; unknown labels are looked up as ``"O"``."""
        label_ids = []
        for label in labels:
            if label not in self.label2id_dict:
                label = "O"
            label_ids.append(self.label2id_dict[label])
        return label_ids

    def file_reader(self,
                    filename,
                    mode="train",
                    batch_size=32,
                    max_seq_len=126):
        """Return a generator function over the examples of ``filename``.

        The file must start with the header line ``text_a<TAB>label``. Every
        following line holds the words and the labels of one example, the
        two columns separated by a tab and the items inside a column
        separated by ``\\002``.

        Each yielded item is ``(word_ids, label_ids, words_len)``: both id
        lists are truncated / zero-padded to ``max_seq_len`` and
        ``words_len`` is the np.int64 length before padding.

        ``mode`` and ``batch_size`` are accepted but not used here.
        """

        def wrapper():
            # `with` guarantees the file is closed when the generator is
            # exhausted, closed early, or aborted by an exception.
            with io.open(filename, "r", encoding="utf-8") as fread:
                # A default avoids leaking StopIteration out of the
                # generator when the file is empty; the header assertion
                # below then reports the problem.
                headline = next(fread, u"")
                headline = headline.strip().split('\t')
                assert len(headline) == 2 and headline[
                    0] == "text_a" and headline[1] == "label"
                for line in fread:
                    words, labels = line.strip("\n").split("\t")
                    if len(words) < 1:
                        continue
                    word_ids = self.word_to_ids(words.split("\002"))
                    label_ids = self.label_to_ids(labels.split("\002"))
                    assert len(word_ids) == len(label_ids)
                    word_ids = word_ids[0:max_seq_len]
                    words_len = np.int64(len(word_ids))
                    padding = [0] * (max_seq_len - int(words_len))
                    word_ids = word_ids + padding
                    label_ids = label_ids[0:max_seq_len] + padding
                    assert len(word_ids) == len(label_ids)
                    yield word_ids, label_ids, words_len

        return wrapper
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
    """Return a generator function that yields batches read from a file.

    Args:
        args: object providing ``epoch``, ``batch_size`` and ``max_seq_len``.
        reader: object whose ``file_reader(file_name, mode, max_seq_len=...)``
            returns a callable producing ``(words, labels, words_len)`` items.
        file_name: path handed to ``reader.file_reader``.
        place: accepted but not used by this function.
        mode: forwarded to ``reader.file_reader``.

    Returns:
        A zero-argument generator function. Every yielded batch is the tuple
        ``(batch_words, batch_labels, seq_lens, batch_labels)``; the labels
        appear twice (presumably once as model input and once as the target;
        confirm against the caller).
    """

    def wrapper():
        batch_words, batch_labels, seq_lens = [], [], []
        # `range` replaces the Python-2-only `xrange`, which raises
        # NameError on Python 3.
        for _ in range(args.epoch):
            for instance in reader.file_reader(
                    file_name, mode, max_seq_len=args.max_seq_len)():
                words, labels, words_len = instance
                if len(seq_lens) < args.batch_size:
                    batch_words.append(words)
                    batch_labels.append(labels)
                    seq_lens.append(words_len)
                if len(seq_lens) == args.batch_size:
                    yield batch_words, batch_labels, seq_lens, batch_labels
                    batch_words, batch_labels, seq_lens = [], [], []

        # Flush the final, possibly smaller, batch once all epochs are read.
        # NOTE(review): placement after the epoch loop is reconstructed from
        # statement order; confirm against the upstream file.
        if len(seq_lens) > 0:
            yield batch_words, batch_labels, seq_lens, batch_labels
            batch_words, batch_labels, seq_lens = [], [], []

    return wrapper
def create_dataloader(generator, place, feed_list=None):
    """Wrap a batch generator in a ``fluid.io.DataLoader``.

    Args:
        generator: batch generator passed to ``set_batch_generator``.
        place: forwarded as ``places`` to ``set_batch_generator``.
        feed_list: optional feed variables; only forwarded to
            ``from_generator`` when it is non-empty.

    Returns:
        The configured data loader.
    """
    loader_options = dict(
        capacity=50, use_double_buffer=True, iterable=True, return_list=True)
    if feed_list:
        loader_options["feed_list"] = feed_list
    data_loader = fluid.io.DataLoader.from_generator(**loader_options)
    data_loader.set_batch_generator(generator, places=place)
    return data_loader
lac
.py
→
sequence_tagging/sequence_tagging
.py
浏览文件 @
acd23c75
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
lexical analysis
network structure
SequenceTagging
network structure
"""
from
__future__
import
division
...
...
@@ -24,200 +24,60 @@ import sys
import
math
import
argparse
import
numpy
as
np
sys
.
path
.
append
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))))
from
metrics
import
Metric
from
model
import
Model
,
Input
,
Loss
,
set_device
from
text
import
SequenceTagging
from
reader
import
LacDataset
,
create_lexnet_data_generator
,
create_dataloader
import
paddle.fluid
as
fluid
from
paddle.fluid.optimizer
import
AdamOptimizer
from
paddle.fluid.initializer
import
NormalInitializer
from
paddle.fluid.dygraph.nn
import
Embedding
,
Linear
,
GRUUnit
class DynamicGRU(fluid.dygraph.Layer):
    """Run a single ``GRUUnit`` step by step along axis 1 of the input.

    ``h_0`` is used as the initial hidden state. With ``is_reverse`` the
    steps are processed from the last index to the first, and the collected
    outputs are flipped back so they line up with the input positions.
    ``init_size`` is accepted but not used.
    """

    def __init__(self,
                 size,
                 h_0=None,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.gru_unit = GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Return the per-step hidden states concatenated along axis 1."""
        step_order = range(inputs.shape[1])
        if self.is_reverse:
            step_order = reversed(step_order)

        state = self.h_0
        outputs = []
        for step in step_order:
            # Take one step and drop the length-1 axis for the GRU unit.
            step_slice = inputs[:, step:step + 1, :]
            step_input = fluid.layers.reshape(
                step_slice, [-1, step_slice.shape[2]], inplace=False)
            state, reset, gate = self.gru_unit(step_input, state)
            # Restore the length-1 axis so the steps can be concatenated.
            outputs.append(
                fluid.layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            outputs = outputs[::-1]
        return fluid.layers.concat(outputs, axis=1)
class SeqTagging(Model):
    """Model wrapper that builds a ``SequenceTagging`` network from ``args``.

    Args:
        args: object providing ``word_emb_dim``, ``grnn_hidden_dim``,
            ``bigru_num`` and ``batch_size``; ``emb_learning_rate`` and
            ``crf_learning_rate`` are optional and default to 1.0.
        vocab_size: passed to ``SequenceTagging`` as ``vocab_size``.
        num_labels: passed to ``SequenceTagging`` as ``num_labels``.
        length: forwarded unchanged to ``SequenceTagging``.
    """

    def __init__(self, args, vocab_size, num_labels, length=None):
        super(SeqTagging, self).__init__()
        self.word_emb_dim = args.word_emb_dim
        self.vocab_size = vocab_size
        self.num_labels = num_labels
        self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = getattr(args, 'emb_learning_rate', 1.0)
        # Read the CRF rate from its own attribute; the previous code tested
        # for 'crf_learning_rate' but then used args.emb_learning_rate.
        self.crf_lr = getattr(args, 'crf_learning_rate', 1.0)
        self.bigru_num = args.bigru_num
        self.batch_size = args.batch_size
        self.init_bound = 0.1
        self.length = length

        self.sequence_tagging = SequenceTagging(
            vocab_size=self.vocab_size,
            num_labels=self.num_labels,
            batch_size=self.batch_size,
            word_emb_dim=self.word_emb_dim,
            grnn_hidden_dim=self.grnn_hidden_dim,
            emb_learning_rate=self.emb_lr,
            crf_learning_rate=self.crf_lr,
            bigru_num=self.bigru_num,
            init_bound=self.init_bound,
            length=self.length)
class BiGRU(fluid.dygraph.Layer):
    """One forward and one backward ``DynamicGRU`` over the same input.

    Each direction has its own ``Linear`` projection to ``3 *
    grnn_hidden_dim`` in front of the GRU; the two direction outputs are
    concatenated along the last axis.
    """

    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
        super(BiGRU, self).__init__()

        def make_attr():
            # A fresh attribute object per layer: uniform initialisation in
            # [-init_bound, init_bound] with L2 decay.
            return fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4))

        # Sub-layers are created in the same order as before.
        self.pre_gru = Linear(
            input_dim=input_dim,
            output_dim=grnn_hidden_dim * 3,
            param_attr=make_attr())
        self.gru = DynamicGRU(
            size=grnn_hidden_dim, h_0=h_0, param_attr=make_attr())
        self.pre_gru_r = Linear(
            input_dim=input_dim,
            output_dim=grnn_hidden_dim * 3,
            param_attr=make_attr())
        self.gru_r = DynamicGRU(
            size=grnn_hidden_dim,
            is_reverse=True,
            h_0=h_0,
            param_attr=make_attr())

    def forward(self, input_feature):
        """Return the forward and backward GRU outputs joined on axis -1."""
        forward_states = self.gru(self.pre_gru(input_feature))
        backward_states = self.gru_r(self.pre_gru_r(input_feature))
        return fluid.layers.concat(
            input=[forward_states, backward_states], axis=-1)
class Linear_chain_crf(fluid.dygraph.Layer):
    """Layer wrapper around the ``linear_chain_crf`` operator.

    Owns a transition parameter of shape ``[size + 2, size]`` which is
    exposed, and can be replaced, through the ``weight`` property.
    """

    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super(Linear_chain_crf, self).__init__()

        self._param_attr = param_attr
        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
            dtype=self._dtype)

    @property
    def weight(self):
        """The transition parameter."""
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label, length=None):
        """Append the ``linear_chain_crf`` op and return its log-likelihood.

        Args:
            input: emission scores, fed as ``Emission``.
            label: gold labels, fed as ``Label``.
            length: optional sequence lengths, fed as ``Length`` when given.
        """
        alpha = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        emission_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        transition_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        log_likelihood = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": [label]
        }
        # Compare with None explicitly: evaluating a multi-element tensor in
        # a boolean context is not a reliable "was it passed" test.
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='linear_chain_crf',
            inputs=this_inputs,
            outputs={
                "Alpha": [alpha],
                "EmissionExps": [emission_exps],
                "TransitionExps": transition_exps,
                "LogLikelihood": log_likelihood
            },
            attrs={"is_test": self._is_test, })
        return log_likelihood
class Crf_decoding(fluid.dygraph.Layer):
    """Layer wrapper around the ``crf_decoding`` operator.

    Owns a transition parameter of shape ``[size + 2, size]`` which is
    exposed, and can be replaced, through the ``weight`` property.
    """

    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super(Crf_decoding, self).__init__()

        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._param_attr = param_attr
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
            dtype=self._dtype)

    @property
    def weight(self):
        """The transition parameter."""
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label=None, length=None):
        """Append the ``crf_decoding`` op and return its ``ViterbiPath``.

        Args:
            input: emission scores, fed as ``Emission``.
            label: optional labels, fed as ``Label`` (``None`` when absent).
            length: optional sequence lengths, fed as ``Length`` when given.
        """
        viterbi_path = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": label
        }
        # Compare with None explicitly: evaluating a multi-element tensor in
        # a boolean context is not a reliable "was it passed" test.
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='crf_decoding',
            inputs=this_inputs,
            outputs={"ViterbiPath": [viterbi_path]},
            attrs={"is_test": self._is_test, })
        return viterbi_path
def forward(self, word, target, lengths):
    """Run ``self.sequence_tagging`` and return its three results.

    Returns:
        The ``(crf_decode, avg_cost, lengths)`` triple produced by the
        wrapped network.
    """
    decoded, cost, seq_lengths = self.sequence_tagging(word, target, lengths)
    return decoded, cost, seq_lengths
class
Chunk_eval
(
fluid
.
dygraph
.
Layer
):
...
...
@@ -266,107 +126,6 @@ class Chunk_eval(fluid.dygraph.Layer):
return
(
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
class LAC(Model):
    """Lexical-analysis network: embedding, stacked BiGRUs, FC layer, CRF.

    Args:
        args: object providing ``word_emb_dim``, ``grnn_hidden_dim``,
            ``bigru_num`` and ``batch_size``; ``emb_learning_rate`` and
            ``crf_learning_rate`` are optional and default to 1.0.
        vocab_size: number of rows of the embedding table.
        num_labels: output width of the FC layer and size of the CRF.
        length: accepted but not used.
    """

    def __init__(self, args, vocab_size, num_labels, length=None):
        super(LAC, self).__init__()
        self.word_emb_dim = args.word_emb_dim
        self.vocab_size = vocab_size
        self.num_labels = num_labels
        self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = getattr(args, 'emb_learning_rate', 1.0)
        # Read the CRF rate from its own attribute; the previous code tested
        # for 'crf_learning_rate' but then used args.emb_learning_rate.
        self.crf_lr = getattr(args, 'crf_learning_rate', 1.0)
        self.bigru_num = args.bigru_num
        self.init_bound = 0.1

        self.word_embedding = Embedding(
            size=[self.vocab_size, self.word_emb_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(
                learning_rate=self.emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound)))

        # Shared initial hidden state for every GRU.
        # NOTE(review): its first dimension is fixed to args.batch_size;
        # confirm how a smaller final batch is handled.
        h_0 = fluid.layers.create_global_var(
            shape=[args.batch_size, self.grnn_hidden_dim],
            value=0.0,
            dtype='float32',
            persistable=True,
            force_cpu=True,
            name='h_0')

        self.bigru_units = []
        for i in range(self.bigru_num):
            # The first BiGRU consumes the embedding output, so its input
            # width is word_emb_dim (previously hard-wired to
            # grnn_hidden_dim, which only works when the two are equal).
            # Later BiGRUs consume the concatenated two-direction output.
            if i == 0:
                input_dim = self.word_emb_dim
            else:
                input_dim = self.grnn_hidden_dim * 2
            self.bigru_units.append(
                self.add_sublayer(
                    "bigru_units%d" % i,
                    BiGRU(
                        input_dim,
                        self.grnn_hidden_dim,
                        self.init_bound,
                        h_0=h_0)))

        self.fc = Linear(
            input_dim=self.grnn_hidden_dim * 2,
            output_dim=self.num_labels,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        self.linear_chain_crf = Linear_chain_crf(
            param_attr=fluid.ParamAttr(
                name='linear_chain_crfw', learning_rate=self.crf_lr),
            size=self.num_labels)

        self.crf_decoding = Crf_decoding(
            param_attr=fluid.ParamAttr(
                name='crfw', learning_rate=self.crf_lr),
            size=self.num_labels)

    def forward(self, word, target, lengths):
        """Compute the CRF cost and the decoded path.

        Returns:
            ``(crf_decode, avg_cost, lengths)`` where ``avg_cost`` is the
            mean of the CRF layer output and ``lengths`` is passed through.
        """
        feature = self.word_embedding(word)
        for i in range(self.bigru_num):
            feature = self.bigru_units[i](feature)
        emission = self.fc(feature)

        crf_cost = self.linear_chain_crf(
            input=emission, label=target, length=lengths)
        avg_cost = fluid.layers.mean(x=crf_cost)
        # Decode with the transition weights owned by the training CRF.
        self.crf_decoding.weight = self.linear_chain_crf.weight
        crf_decode = self.crf_decoding(input=emission, length=lengths)
        return crf_decode, avg_cost, lengths
class
LacLoss
(
Loss
):
def
__init__
(
self
):
super
(
LacLoss
,
self
).
__init__
()
...
...
@@ -431,166 +190,6 @@ class ChunkEval(Metric):
return
self
.
_name
class LacDataset(object):
    """Load the dictionaries and data files of a lexical-analysis dataset.

    The constructor reads ``word_dict_path``, ``label_dict_path`` and
    ``word_rep_dict_path`` from ``args`` and immediately loads the lookup
    tables built from those files.
    """

    def __init__(self, args):
        self.word_dict_path = args.word_dict_path
        self.label_dict_path = args.label_dict_path
        self.word_rep_dict_path = args.word_rep_dict_path
        self._load_dict()

    def _load_dict(self):
        """Populate the word/label lookup tables and the replacement table."""
        # Dictionary files are stored as "<id>\t<token>"; reverse=True makes
        # the token the key and converts the id to np.int64.
        self.word2id_dict = self.load_kv_dict(
            self.word_dict_path, reverse=True, value_func=np.int64)
        self.id2word_dict = self.load_kv_dict(self.word_dict_path)
        self.label2id_dict = self.load_kv_dict(
            self.label_dict_path, reverse=True, value_func=np.int64)
        self.id2label_dict = self.load_kv_dict(self.label_dict_path)
        if self.word_rep_dict_path is None:
            self.word_replace_dict = dict()
        else:
            self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)

    def load_kv_dict(self,
                     dict_path,
                     reverse=False,
                     delimiter="\t",
                     key_func=None,
                     value_func=None):
        """Load a two-column text file into a dict.

        Args:
            dict_path: path of a UTF-8 text file, one entry per line.
            reverse: when true the second column is the key and the first
                column the value; otherwise the first column is the key.
            delimiter: column separator.
            key_func: optional callable applied to every key.
            value_func: optional callable applied to every value.

        Returns:
            dict mapping (converted) keys to (converted) values. Lines that
            do not split into exactly two columns are ignored.

        Raises:
            KeyError: if the same key occurs twice.
        """
        result_dict = {}
        # The context manager closes the file even if a duplicate key or a
        # conversion error aborts the loop.
        with io.open(dict_path, "r", encoding='utf8') as fin:
            for line in fin:
                terms = line.strip("\n").split(delimiter)
                if len(terms) != 2:
                    continue
                if reverse:
                    value, key = terms
                else:
                    key, value = terms
                # Convert the key before the duplicate check so that keys
                # which collide only after conversion are detected too.
                if key_func:
                    key = key_func(key)
                if key in result_dict:
                    raise KeyError("key duplicated with [%s]" % (key))
                if value_func:
                    value = value_func(value)
                result_dict[key] = value
        return result_dict

    @property
    def vocab_size(self):
        """Number of entries in the word-to-id table."""
        return len(self.word2id_dict)

    @property
    def num_labels(self):
        """Number of entries in the label-to-id table."""
        return len(self.label2id_dict)

    def get_num_examples(self, filename):
        """Return the number of lines in ``filename`` (header included)."""
        with io.open(filename, "r", encoding='utf8') as fin:
            return sum(1 for _ in fin)

    def word_to_ids(self, words):
        """Convert words to ids.

        Each word is first passed through the replacement table; words that
        are still unknown are looked up as ``"OOV"``.
        """
        word_ids = []
        for word in words:
            word = self.word_replace_dict.get(word, word)
            if word not in self.word2id_dict:
                word = "OOV"
            word_ids.append(self.word2id_dict[word])
        return word_ids

    def label_to_ids(self, labels):
        """Convert labels to ids; unknown labels are looked up as ``"O"``."""
        label_ids = []
        for label in labels:
            if label not in self.label2id_dict:
                label = "O"
            label_ids.append(self.label2id_dict[label])
        return label_ids

    def file_reader(self,
                    filename,
                    mode="train",
                    batch_size=32,
                    max_seq_len=126):
        """Return a generator function over the examples of ``filename``.

        The file must start with the header line ``text_a<TAB>label``. Every
        following line holds the words and the labels of one example, the
        two columns separated by a tab and the items inside a column
        separated by ``\\002``.

        Each yielded item is ``(word_ids, label_ids, words_len)``: both id
        lists are truncated / zero-padded to ``max_seq_len`` and
        ``words_len`` is the np.int64 length before padding.

        ``mode`` and ``batch_size`` are accepted but not used here.
        """

        def wrapper():
            # `with` guarantees the file is closed when the generator is
            # exhausted, closed early, or aborted by an exception.
            with io.open(filename, "r", encoding="utf-8") as fread:
                # A default avoids leaking StopIteration out of the
                # generator when the file is empty; the header assertion
                # below then reports the problem.
                headline = next(fread, u"")
                headline = headline.strip().split('\t')
                assert len(headline) == 2 and headline[
                    0] == "text_a" and headline[1] == "label"
                for line in fread:
                    words, labels = line.strip("\n").split("\t")
                    if len(words) < 1:
                        continue
                    word_ids = self.word_to_ids(words.split("\002"))
                    label_ids = self.label_to_ids(labels.split("\002"))
                    assert len(word_ids) == len(label_ids)
                    word_ids = word_ids[0:max_seq_len]
                    words_len = np.int64(len(word_ids))
                    padding = [0] * (max_seq_len - int(words_len))
                    word_ids = word_ids + padding
                    label_ids = label_ids[0:max_seq_len] + padding
                    assert len(word_ids) == len(label_ids)
                    yield word_ids, label_ids, words_len

        return wrapper
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
    """Return a generator function that yields batches read from a file.

    Args:
        args: object providing ``epoch``, ``batch_size`` and ``max_seq_len``.
        reader: object whose ``file_reader(file_name, mode, max_seq_len=...)``
            returns a callable producing ``(words, labels, words_len)`` items.
        file_name: path handed to ``reader.file_reader``.
        place: accepted but not used by this function.
        mode: forwarded to ``reader.file_reader``.

    Returns:
        A zero-argument generator function. Every yielded batch is the tuple
        ``(batch_words, batch_labels, seq_lens, batch_labels)``; the labels
        appear twice (presumably once as model input and once as the target;
        confirm against the caller).
    """

    def wrapper():
        batch_words, batch_labels, seq_lens = [], [], []
        # `range` replaces the Python-2-only `xrange`, which raises
        # NameError on Python 3.
        for _ in range(args.epoch):
            for instance in reader.file_reader(
                    file_name, mode, max_seq_len=args.max_seq_len)():
                words, labels, words_len = instance
                if len(seq_lens) < args.batch_size:
                    batch_words.append(words)
                    batch_labels.append(labels)
                    seq_lens.append(words_len)
                if len(seq_lens) == args.batch_size:
                    yield batch_words, batch_labels, seq_lens, batch_labels
                    batch_words, batch_labels, seq_lens = [], [], []

        # Flush the final, possibly smaller, batch once all epochs are read.
        # NOTE(review): placement after the epoch loop is reconstructed from
        # statement order; confirm against the upstream file.
        if len(seq_lens) > 0:
            yield batch_words, batch_labels, seq_lens, batch_labels
            batch_words, batch_labels, seq_lens = [], [], []

    return wrapper
def create_dataloader(generator, place, feed_list=None):
    """Wrap a batch generator in a ``fluid.io.DataLoader``.

    Args:
        generator: batch generator passed to ``set_batch_generator``.
        place: forwarded as ``places`` to ``set_batch_generator``.
        feed_list: optional feed variables; only forwarded to
            ``from_generator`` when it is non-empty.

    Returns:
        The configured data loader.
    """
    loader_options = dict(
        capacity=50, use_double_buffer=True, iterable=True, return_list=True)
    if feed_list:
        loader_options["feed_list"] = feed_list
    data_loader = fluid.io.DataLoader.from_generator(**loader_options)
    data_loader.set_batch_generator(generator, places=place)
    return data_loader
def
main
(
args
):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
...
...
@@ -603,15 +202,11 @@ def main(args):
]
labels
=
[
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'labels'
)]
feed
=
[
x
.
forward
()
for
x
in
inputs
+
labels
]
feed
_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
+
labels
]
dataset
=
LacDataset
(
args
)
train_path
=
os
.
path
.
join
(
args
.
data
,
"train.tsv"
)
test_path
=
os
.
path
.
join
(
args
.
data
,
"test.tsv"
)
if
args
.
dynamic
:
feed_list
=
None
else
:
feed_list
=
feed
train_generator
=
create_lexnet_data_generator
(
args
,
reader
=
dataset
,
file_name
=
train_path
,
place
=
place
,
mode
=
"train"
)
test_generator
=
create_lexnet_data_generator
(
...
...
@@ -624,7 +219,7 @@ def main(args):
vocab_size
=
dataset
.
vocab_size
num_labels
=
dataset
.
num_labels
model
=
LAC
(
args
,
vocab_size
,
num_labels
)
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
)
optim
=
AdamOptimizer
(
learning_rate
=
args
.
base_learning_rate
,
...
...
text.py
浏览文件 @
acd23c75
...
...
@@ -8,7 +8,7 @@ import paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.layers.utils
as
utils
from
paddle.fluid.layers.utils
import
map_structure
,
flatten
,
pack_sequence_as
from
paddle.fluid.dygraph
import
to_variable
,
Embedding
,
Linear
,
LayerNorm
from
paddle.fluid.dygraph
import
to_variable
,
Embedding
,
Linear
,
LayerNorm
,
GRUUnit
from
paddle.fluid.data_feeder
import
convert_dtype
from
paddle.fluid
import
layers
...
...
@@ -19,8 +19,8 @@ __all__ = [
'RNNCell'
,
'BasicLSTMCell'
,
'BasicGRUCell'
,
'RNN'
,
'DynamicDecode'
,
'BeamSearchDecoder'
,
'MultiHeadAttention'
,
'FFN'
,
'TransformerEncoderLayer'
,
'TransformerEncoder'
,
'TransformerDecoderLayer'
,
'TransformerDecoder'
,
'TransformerBeamSearchDecoder'
]
'TransformerDecoder'
,
'TransformerBeamSearchDecoder'
,
'DynamicGRU'
,
'BiGRU'
,
'Linear_chain_crf'
,
'Crf_decoding'
,
'SequenceTagging'
]
class
RNNCell
(
Layer
):
...
...
@@ -998,3 +998,299 @@ class TransformerDecoder(Layer):
decoder_layer
.
cross_attn
.
cal_kv
(
enc_output
,
enc_output
)))
for
decoder_layer
in
self
.
decoder_layers
]
class DynamicGRU(fluid.dygraph.Layer):
    """Run a single ``GRUUnit`` step by step along axis 1 of the input.

    ``h_0`` is used as the initial hidden state. With ``is_reverse`` the
    steps are processed from the last index to the first, and the collected
    outputs are flipped back so they line up with the input positions.
    ``init_size`` is accepted but not used.
    """

    def __init__(self,
                 size,
                 h_0=None,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.gru_unit = GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Return the per-step hidden states concatenated along axis 1."""
        step_order = range(inputs.shape[1])
        if self.is_reverse:
            step_order = reversed(step_order)

        state = self.h_0
        outputs = []
        for step in step_order:
            # Take one step and drop the length-1 axis for the GRU unit.
            step_slice = inputs[:, step:step + 1, :]
            step_input = fluid.layers.reshape(
                step_slice, [-1, step_slice.shape[2]], inplace=False)
            state, reset, gate = self.gru_unit(step_input, state)
            # Restore the length-1 axis so the steps can be concatenated.
            outputs.append(
                fluid.layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            outputs = outputs[::-1]
        return fluid.layers.concat(outputs, axis=1)
class BiGRU(fluid.dygraph.Layer):
    """One forward and one backward ``DynamicGRU`` over the same input.

    Each direction has its own ``Linear`` projection to ``3 *
    grnn_hidden_dim`` in front of the GRU; the two direction outputs are
    concatenated along the last axis.
    """

    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
        super(BiGRU, self).__init__()

        def make_attr():
            # A fresh attribute object per layer: uniform initialisation in
            # [-init_bound, init_bound] with L2 decay.
            return fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4))

        # Sub-layers are created in the same order as before.
        self.pre_gru = Linear(
            input_dim=input_dim,
            output_dim=grnn_hidden_dim * 3,
            param_attr=make_attr())
        self.gru = DynamicGRU(
            size=grnn_hidden_dim, h_0=h_0, param_attr=make_attr())
        self.pre_gru_r = Linear(
            input_dim=input_dim,
            output_dim=grnn_hidden_dim * 3,
            param_attr=make_attr())
        self.gru_r = DynamicGRU(
            size=grnn_hidden_dim,
            is_reverse=True,
            h_0=h_0,
            param_attr=make_attr())

    def forward(self, input_feature):
        """Return the forward and backward GRU outputs joined on axis -1."""
        forward_states = self.gru(self.pre_gru(input_feature))
        backward_states = self.gru_r(self.pre_gru_r(input_feature))
        return fluid.layers.concat(
            input=[forward_states, backward_states], axis=-1)
class Linear_chain_crf(fluid.dygraph.Layer):
    """Layer wrapper around the ``linear_chain_crf`` operator.

    Owns a transition parameter of shape ``[size + 2, size]`` which is
    exposed, and can be replaced, through the ``weight`` property.
    """

    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super(Linear_chain_crf, self).__init__()

        self._param_attr = param_attr
        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
            dtype=self._dtype)

    @property
    def weight(self):
        """The transition parameter."""
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label, length=None):
        """Append the ``linear_chain_crf`` op and return its log-likelihood.

        Args:
            input: emission scores, fed as ``Emission``.
            label: gold labels, fed as ``Label``.
            length: optional sequence lengths, fed as ``Length`` when given.
        """
        alpha = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        emission_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        transition_exps = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        log_likelihood = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": [label]
        }
        # Compare with None explicitly: evaluating a multi-element tensor in
        # a boolean context is not a reliable "was it passed" test.
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='linear_chain_crf',
            inputs=this_inputs,
            outputs={
                "Alpha": [alpha],
                "EmissionExps": [emission_exps],
                "TransitionExps": transition_exps,
                "LogLikelihood": log_likelihood
            },
            attrs={"is_test": self._is_test, })
        return log_likelihood
class Crf_decoding(fluid.dygraph.Layer):
    """Layer wrapper around the ``crf_decoding`` operator.

    Owns a transition parameter of shape ``[size + 2, size]`` which is
    exposed, and can be replaced, through the ``weight`` property.
    """

    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
        super(Crf_decoding, self).__init__()

        self._dtype = dtype
        self._size = size
        self._is_test = is_test
        self._param_attr = param_attr
        self._transition = self.create_parameter(
            attr=self._param_attr,
            shape=[self._size + 2, self._size],
            dtype=self._dtype)

    @property
    def weight(self):
        """The transition parameter."""
        return self._transition

    @weight.setter
    def weight(self, value):
        self._transition = value

    def forward(self, input, label=None, length=None):
        """Append the ``crf_decoding`` op and return its ``ViterbiPath``.

        Args:
            input: emission scores, fed as ``Emission``.
            label: optional labels, fed as ``Label`` (``None`` when absent).
            length: optional sequence lengths, fed as ``Length`` when given.
        """
        viterbi_path = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)
        this_inputs = {
            "Emission": [input],
            "Transition": self._transition,
            "Label": label
        }
        # Compare with None explicitly: evaluating a multi-element tensor in
        # a boolean context is not a reliable "was it passed" test.
        if length is not None:
            this_inputs['Length'] = [length]
        self._helper.append_op(
            type='crf_decoding',
            inputs=this_inputs,
            outputs={"ViterbiPath": [viterbi_path]},
            attrs={"is_test": self._is_test, })
        return viterbi_path
class SequenceTagging(fluid.dygraph.Layer):
    """Sequence-tagging network: embedding, stacked BiGRUs, FC layer, CRF.

    Args:
        vocab_size: number of rows of the embedding table.
        num_labels: output width of the FC layer and size of the CRF.
        batch_size: first dimension of the shared initial GRU state.
        word_emb_dim: width of the word embedding.
        grnn_hidden_dim: hidden size of each GRU direction.
        emb_learning_rate: learning rate set on the embedding parameter.
        crf_learning_rate: learning rate set on the CRF parameters.
        bigru_num: number of stacked BiGRU layers.
        init_bound: bound of the uniform parameter initialisation.
        length: accepted but not used.
    """

    def __init__(self,
                 vocab_size,
                 num_labels,
                 batch_size,
                 word_emb_dim=128,
                 grnn_hidden_dim=128,
                 emb_learning_rate=0.1,
                 crf_learning_rate=0.1,
                 bigru_num=2,
                 init_bound=0.1,
                 length=None):
        super(SequenceTagging, self).__init__()
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size
        self.num_labels = num_labels
        self.grnn_hidden_dim = grnn_hidden_dim
        self.emb_lr = emb_learning_rate
        self.crf_lr = crf_learning_rate
        self.bigru_num = bigru_num
        self.batch_size = batch_size
        # Honour the constructor argument; it was previously overwritten
        # with a hard-coded 0.1 (the default is still 0.1).
        self.init_bound = init_bound

        self.word_embedding = Embedding(
            size=[self.vocab_size, self.word_emb_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(
                learning_rate=self.emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound)))

        # Shared initial hidden state for every GRU.
        # NOTE(review): its first dimension is fixed to batch_size; confirm
        # how a smaller final batch is handled.
        h_0 = fluid.layers.create_global_var(
            shape=[self.batch_size, self.grnn_hidden_dim],
            value=0.0,
            dtype='float32',
            persistable=True,
            force_cpu=True,
            name='h_0')

        self.bigru_units = []
        for i in range(self.bigru_num):
            # The first BiGRU consumes the embedding output, so its input
            # width is word_emb_dim (previously hard-wired to
            # grnn_hidden_dim, which only works when the two are equal).
            # Later BiGRUs consume the concatenated two-direction output.
            if i == 0:
                input_dim = self.word_emb_dim
            else:
                input_dim = self.grnn_hidden_dim * 2
            self.bigru_units.append(
                self.add_sublayer(
                    "bigru_units%d" % i,
                    BiGRU(
                        input_dim,
                        self.grnn_hidden_dim,
                        self.init_bound,
                        h_0=h_0)))

        self.fc = Linear(
            input_dim=self.grnn_hidden_dim * 2,
            output_dim=self.num_labels,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        self.linear_chain_crf = Linear_chain_crf(
            param_attr=fluid.ParamAttr(
                name='linear_chain_crfw', learning_rate=self.crf_lr),
            size=self.num_labels)

        self.crf_decoding = Crf_decoding(
            param_attr=fluid.ParamAttr(
                name='crfw', learning_rate=self.crf_lr),
            size=self.num_labels)

    def forward(self, word, target, lengths):
        """Compute the CRF cost and the decoded path.

        Returns:
            ``(crf_decode, avg_cost, lengths)`` where ``avg_cost`` is the
            mean of the CRF layer output and ``lengths`` is passed through.
        """
        feature = self.word_embedding(word)
        for i in range(self.bigru_num):
            feature = self.bigru_units[i](feature)
        emission = self.fc(feature)

        crf_cost = self.linear_chain_crf(
            input=emission, label=target, length=lengths)
        avg_cost = fluid.layers.mean(x=crf_cost)
        # Decode with the transition weights owned by the training CRF.
        self.crf_decoding.weight = self.linear_chain_crf.weight
        crf_decode = self.crf_decoding(input=emission, length=lengths)
        return crf_decode, avg_cost, lengths
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录