PaddlePaddle / models
Commit b6bd7386
Authored Nov 13, 2019 by xujinanne

add lac in dygraph

Parent: 9ce0dcb3
Showing 5 changed files with 693 additions and 0 deletions (+693 -0)
dygraph/lexical_analysis/args.yaml    +83  -0
dygraph/lexical_analysis/main.py      +171 -0
dygraph/lexical_analysis/nets.py      +178 -0
dygraph/lexical_analysis/reader.py    +186 -0
dygraph/lexical_analysis/utils.py     +75  -0
dygraph/lexical_analysis/args.yaml  0 → 100644
model:
    word_emb_dim:
        val: 128
        meaning: "The dimension in which a word is embedded."
    grnn_hidden_dim:
        val: 128
        meaning: "The number of hidden nodes in the GRNN layer."
    bigru_num:
        val: 2
        meaning: "The number of bi_gru layers in the network."
    init_checkpoint:
        val: ""
        meaning: "Path to init model"
    inference_save_dir:
        val: ""
        meaning: "Path to save inference model"
train:
    random_seed:
        val: 0
        meaning: "Random seed for training"
    print_steps:
        val: 1
        meaning: "Print the result once per xxx batches of training"
    save_steps:
        val: 10
        meaning: "Save the model once per xxxx batches of training"
    validation_steps:
        val: 10
        meaning: "Do the validation once per xxxx batches of training"
    batch_size:
        val: 100
        meaning: "The number of sequences contained in a mini-batch"
    epoch:
        val: 10
        meaning: "Corpus iteration num"
    use_cuda:
        val: True
        meaning: "If set, use GPU for training."
    traindata_shuffle_buffer:
        val: 20000
        meaning: "The buffer size used to shuffle the training data."
    base_learning_rate:
        val: 0.001
        meaning: "The basic learning rate that affects the entire network."
    emb_learning_rate:
        val: 2
        meaning: "The real learning rate of the embedding layer will be (emb_learning_rate * base_learning_rate)."
    crf_learning_rate:
        val: 0.2
        meaning: "The real learning rate of the CRF layer will be (crf_learning_rate * base_learning_rate)."
    enable_ce:
        val: false
        meaning: 'If set, run the task with continuous evaluation logs.'
    cpu_num:
        val: 10
        meaning: "The number of CPUs used to train the model; this argument is ignored if use_cuda=true"
data:
    word_dict_path:
        val: "./conf/word.dic"
        meaning: "The path of the word dictionary."
    label_dict_path:
        val: "./conf/tag.dic"
        meaning: "The path of the label dictionary."
    word_rep_dict_path:
        val: "./conf/q2b.dic"
        meaning: "The path of the word replacement dictionary."
    train_data:
        val: "./data/train.tsv"
        meaning: "The file where the training data is located."
    test_data:
        val: "./data/test.tsv"
        meaning: "The file where the test data is located."
    infer_data:
        val: "./data/infer.tsv"
        meaning: "The file where the inference data is located."
    model_save_dir:
        val: "./models"
        meaning: "The model will be saved in this path."
    max_seq_lens:
        val: 65
        meaning: "The max sentence length of the data"
\ No newline at end of file
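
Each leaf entry pairs a `val` (the default) with a `meaning` (the help string); `load_yaml` in utils.py below turns every entry into a command-line flag, so any default can be overridden at launch. A minimal sketch of that round trip, assuming PyYAML is installed and the file above is saved as args.yaml:

    import argparse
    import yaml

    with open('args.yaml') as f:
        conf = yaml.safe_load(f)

    parser = argparse.ArgumentParser()
    for section in conf:                      # model / train / data
        for name, entry in conf[section].items():
            parser.add_argument(
                '--' + name,
                # note: utils.load_yaml special-cases bool via str2bool,
                # since bool('False') is True in plain argparse
                type=type(entry['val']),
                default=entry['val'],
                help=entry['meaning'])

    args = parser.parse_args(['--batch_size', '32'])
    print(args.batch_size)                    # 32 overrides the YAML's 100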
dygraph/lexical_analysis/main.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import os
import time
import argparse

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

import nets
import reader
import utils


def train(args, place):
    with fluid.dygraph.guard(place):
        dataset = reader.Dataset(args)
        num_train_examples = dataset.get_num_examples(args.train_data)
        max_train_steps = args.epoch * num_train_examples // args.batch_size

        # define reader
        train_processor = reader.LACProcessor(args, args.train_data,
                                              args.word_dict_path)
        test_processor = dataset.file_reader(args.test_data, mode="test")

        # define network
        model = nets.LAC("lac_net", args, dataset.vocab_size,
                         dataset.num_labels)
        sgd_optimizer = fluid.optimizer.Adagrad(
            learning_rate=args.base_learning_rate)

        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        for eop in range(args.epoch):
            time_begin = time.time()
            for data in train_processor.data_generator("train")():
                steps += 1
                # truncate each sequence to max_seq_lens, right-pad with the
                # OOV id (dataset.vocab_size) and flatten to shape [-1, 1]
                doc = to_variable(
                    np.array([
                        np.pad(x[0][0:args.max_seq_lens],
                               (0, args.max_seq_lens -
                                len(x[0][0:args.max_seq_lens])),
                               'constant',
                               constant_values=(dataset.vocab_size))
                        for x in data
                    ]).astype('int64').reshape(-1, 1))
                seq_lens = to_variable(
                    np.array([len(x[0]) for x in data]).astype('int64'))
                # labels are padded the same way, using dataset.num_labels
                targets = to_variable(
                    np.array([
                        np.pad(x[1][0:args.max_seq_lens],
                               (0, args.max_seq_lens -
                                len(x[1][0:args.max_seq_lens])),
                               'constant',
                               constant_values=(dataset.num_labels))
                        for x in data
                    ]).astype('int64'))

                model.train()
                avg_cost, prediction, acc = model(doc, targets, seq_lens)
                avg_cost.backward()

                # count real (non-padding) tokens for loss/acc averaging
                np_mask = (doc.numpy() != dataset.vocab_size).astype('int32')
                word_num = np.sum(np_mask)

                sgd_optimizer.minimize(avg_cost)
                model.clear_gradients()

                total_cost.append(avg_cost.numpy() * word_num)
                total_acc.append(acc.numpy() * word_num)
                total_num_seqs.append(word_num)

                if steps % args.print_steps == 0:
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.print_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.validation_steps == 0:
                    total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
                    model.eval()
                    eval_steps = 0
                    for data in train_processor.data_generator("train")():
                        steps += 1
                        eval_doc = to_variable(
                            np.array([
                                np.pad(x[0][0:args.max_seq_lens],
                                       (0, args.max_seq_lens -
                                        len(x[0][0:args.max_seq_lens])),
                                       'constant',
                                       constant_values=(dataset.vocab_size))
                                for x in data
                            ]).astype('int64').reshape(-1, 1))
                        eval_seq_lens = to_variable(
                            np.array([len(x[0]) for x in data])
                            .astype('int64').reshape(args.batch_size, 1))
                        eval_targets = to_variable(
                            np.array([
                                np.pad(x[1][0:args.max_seq_lens],
                                       (0, args.max_seq_lens -
                                        len(x[1][0:args.max_seq_lens])),
                                       'constant',
                                       constant_values=(dataset.num_labels))
                                for x in data
                            ]).astype('int64'))
                        eval_avg_cost, eval_prediction, eval_acc = model(
                            eval_doc, eval_targets, eval_seq_lens)
                        eval_np_mask = (eval_doc.numpy() !=
                                        dataset.vocab_size).astype('int32')
                        eval_word_num = np.sum(eval_np_mask)
                        total_eval_cost.append(
                            eval_avg_cost.numpy() * eval_word_num)
                        total_eval_acc.append(
                            eval_acc.numpy() * eval_word_num)
                        total_eval_num_seqs.append(eval_word_num)
                        eval_steps += 1

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("Final validation result: step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps,
                           np.sum(total_eval_cost) / np.sum(total_eval_num_seqs),
                           np.sum(total_eval_acc) / np.sum(total_eval_num_seqs),
                           eval_steps / used_time))
                    time_begin = time.time()

                    if args.enable_ce:
                        print("kpis\ttrain_loss\t%0.3f" %
                              (np.sum(total_eval_cost) /
                               np.sum(total_eval_num_seqs)))
                        print("kpis\ttrain_acc\t%0.3f" %
                              (np.sum(total_eval_acc) /
                               np.sum(total_eval_num_seqs)))

                if steps % args.save_steps == 0:
                    save_path = "save_dir_" + str(steps)
                    print('save model to: ' + save_path)
                    fluid.dygraph.save_dygraph(model.state_dict(), save_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(__doc__)
    utils.load_yaml(parser, 'args.yaml')
    args = parser.parse_args()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = 1
    print(args)
    train(args, place)
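
The nested `np.pad` expressions are the densest part of the training loop: each ragged sequence is truncated to max_seq_lens and then right-padded with the OOV id so the batch stacks into one rectangular array. A standalone sketch of that transform with hypothetical toy values (pad_id stands in for dataset.vocab_size):

    import numpy as np

    max_seq_lens = 5
    pad_id = 100                               # stands in for dataset.vocab_size

    batch = [[3, 7, 2], [9, 1, 4, 4, 8, 6]]    # ragged word-id sequences
    padded = np.array([
        np.pad(seq[0:max_seq_lens],
               (0, max_seq_lens - len(seq[0:max_seq_lens])),
               'constant', constant_values=pad_id)
        for seq in batch
    ]).astype('int64')

    print(padded)
    # [[  3   7   2 100 100]
    #  [  9   1   4   4   8]]

Because pad_id never appears as a real token, the mask `doc.numpy() != dataset.vocab_size` later recovers exactly the non-padding positions.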
dygraph/lexical_analysis/nets.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, Embedding
from paddle.fluid.dygraph import GRUUnit
from paddle.fluid.dygraph.base import to_variable


class DynamicGRU(fluid.dygraph.Layer):
    def __init__(self,
                 scope_name,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__(scope_name)
        self.gru_unit = GRUUnit(
            self.full_name(),
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        # unroll the GRU cell over the time axis; the reverse pass visits
        # the steps back to front and re-reverses the collected outputs
        hidden = self.h_0
        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                i = inputs.shape[1] - 1 - i
            input_ = inputs[:, i:i + 1, :]
            input_ = fluid.layers.reshape(
                input_, [-1, input_.shape[2]], inplace=False)
            hidden, reset, gate = self.gru_unit(input_, hidden)
            hidden_ = fluid.layers.reshape(
                hidden, [-1, 1, hidden.shape[1]], inplace=False)
            res.append(hidden_)
        if self.is_reverse:
            res = res[::-1]
        res = fluid.layers.concat(res, axis=1)
        return res


class LAC(fluid.dygraph.Layer):
    def __init__(self,
                 name_scope,
                 args,
                 vocab_size,
                 num_labels,
                 for_infer=True,
                 target=None):
        super(LAC, self).__init__(name_scope)

        self.word_emb_dim = args.word_emb_dim
        self.dict_dim = vocab_size
        self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
            args) else 1.0
        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
            args) else 1.0
        self.bigru_num = args.bigru_num
        self.init_bound = 0.1
        self.IS_SPARSE = True
        self.max_seq_lens = args.max_seq_lens

        self._word_embedding = Embedding(
            self.full_name(),
            size=[vocab_size, self.word_emb_dim],
            dtype='float32',
            is_sparse=self.IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=self.emb_lr,
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound)))

        self._emission_fc = FC(
            self.full_name(),
            size=num_labels,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

    def _bigru_layer(self, input_feature, grnn_hidden_dim):
        """
        define the bidirectional gru layer
        """
        pre_gru = FC(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = DynamicGRU(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        pre_gru_r = FC(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = DynamicGRU(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def forward(self, inputs, targets, seq_lens):
        emb = self._word_embedding(inputs)
        # zero out embeddings at padding positions (ids equal to dict_dim)
        o_np_mask = (inputs.numpy() != self.dict_dim).astype('float32')
        mask_emb = fluid.layers.expand(
            to_variable(o_np_mask), [1, self.word_emb_dim])
        emb = emb * mask_emb
        emb = fluid.layers.reshape(
            emb, shape=[-1, 1, self.max_seq_lens, self.word_emb_dim])
        input_feature = emb
        # stack bigru_num bidirectional GRU layers
        for i in range(self.bigru_num):
            bigru_output = self._bigru_layer(input_feature,
                                             self.grnn_hidden_dim)
            input_feature = bigru_output
        emission = self._emission_fc(input_feature)

        if targets is not None:
            crf_cost = fluid.layers.linear_chain_crf(
                input=emission,
                label=targets,
                param_attr=fluid.ParamAttr(
                    name='crfw', learning_rate=self.crf_lr),
                length=seq_lens)
            avg_cost = fluid.layers.mean(x=crf_cost)
            crf_decode = fluid.layers.crf_decoding(
                input=emission,
                param_attr=fluid.ParamAttr(name='crfw'),
                length=seq_lens)
            return avg_cost, crf_decode
        else:
            size = emission.shape[1]
            fluid.layers.create_parameter(
                shape=[size + 2, size], dtype=emission.dtype, name='crfw')
            crf_decode = fluid.layers.crf_decoding(
                input=emission,
                param_attr=fluid.ParamAttr(name='crfw'),
                length=seq_lens)
            return crf_decode
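
DynamicGRU above unrolls GRUUnit one time step at a time; the reverse direction simply visits the steps back to front and re-reverses the collected outputs, and concatenating the two directions gives the bi-GRU feature that `_bigru_layer` builds. A shape-only sketch of the same unroll pattern in plain NumPy, with a stand-in step function rather than real GRU arithmetic:

    import numpy as np

    def unroll(inputs, step, hidden, reverse=False):
        # inputs: [batch, time, feat]; step: (x_t, h) -> new h
        T = inputs.shape[1]
        outs = []
        for i in range(T):
            t = T - 1 - i if reverse else i
            hidden = step(inputs[:, t, :], hidden)
            outs.append(hidden[:, np.newaxis, :])
        if reverse:
            outs = outs[::-1]                    # restore chronological order
        return np.concatenate(outs, axis=1)      # [batch, time, hidden]

    # toy "GRU" step: a leaky running average, for shape checking only
    step = lambda x, h: 0.5 * h + 0.5 * x
    x = np.random.rand(2, 4, 3)
    h0 = np.zeros((2, 3))
    fwd = unroll(x, step, h0)
    bwd = unroll(x, step, h0, reverse=True)
    bigru = np.concatenate([fwd, bwd], axis=2)   # [2, 4, 6]
    print(bigru.shape)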
dygraph/lexical_analysis/reader.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file_reader converts raw corpus to input.
"""
from __future__ import print_function

import os
import argparse
import io
import glob

import paddle


def load_kv_dict(dict_path,
                 reverse=False,
                 delimiter="\t",
                 key_func=None,
                 value_func=None):
    """
    Load key-value dict from file
    """
    result_dict = {}
    for line in io.open(dict_path, "r", encoding='utf8'):
        terms = line.strip("\n").split(delimiter)
        if len(terms) != 2:
            continue
        if reverse:
            value, key = terms
        else:
            key, value = terms
        if key in result_dict:
            raise KeyError("key duplicated with [%s]" % (key))
        if key_func:
            key = key_func(key)
        if value_func:
            value = value_func(value)
        result_dict[key] = value
    return result_dict


class Dataset(object):
    """data reader"""

    def __init__(self, args, mode="train"):
        # read dicts
        self.word2id_dict = load_kv_dict(
            args.word_dict_path, reverse=True, value_func=int)
        self.id2word_dict = load_kv_dict(args.word_dict_path)
        self.label2id_dict = load_kv_dict(
            args.label_dict_path, reverse=True, value_func=int)
        self.id2label_dict = load_kv_dict(args.label_dict_path)
        self.word_replace_dict = load_kv_dict(args.word_rep_dict_path)

    @property
    def vocab_size(self):
        """vocabulary size"""
        return max(self.word2id_dict.values()) + 1

    @property
    def num_labels(self):
        """num_labels"""
        return max(self.label2id_dict.values()) + 1

    def get_num_examples(self, filename):
        """number of lines in the file"""
        return sum(1 for line in io.open(filename, "r", encoding='utf8'))

    def word_to_ids(self, words):
        """convert word to word index"""
        word_ids = []
        for word in words:
            word = self.word_replace_dict.get(word, word)
            if word not in self.word2id_dict:
                word = "OOV"
            word_id = self.word2id_dict[word]
            word_ids.append(word_id)
        return word_ids

    def label_to_ids(self, labels):
        """convert label to label index"""
        label_ids = []
        for label in labels:
            if label not in self.label2id_dict:
                label = "O"
            label_id = self.label2id_dict[label]
            label_ids.append(label_id)
        return label_ids

    def file_reader(self, filename, max_seq_len=64, mode="train"):
        """
        yield (word_idx, target_idx) one by one from file,
        or yield (word_idx, ) in `infer` mode
        """

        def wrapper():
            fread = io.open(filename, "r", encoding="utf-8")
            if mode == "infer":
                for line in fread:
                    words = line.strip()
                    word_ids = self.word_to_ids(words)
                    yield (word_ids[0:max_seq_len], )
            else:
                headline = next(fread)
                headline = headline.strip().split('\t')
                assert len(headline) == 2 and headline[
                    0] == "text_a" and headline[1] == "label"
                for line in fread:
                    words, labels = line.strip("\n").split("\t")
                    if len(words) < 1:
                        continue
                    word_ids = self.word_to_ids(words.split("\002"))
                    label_ids = self.label_to_ids(labels.split("\002"))
                    assert len(word_ids) == len(label_ids)
                    yield word_ids[0:max_seq_len], label_ids[0:max_seq_len]
            fread.close()

        return wrapper


class LACProcessor(object):
    def __init__(self, args, data_dir, vocab_path, random_seed=None):
        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
        self.args = args
        self.dataset = Dataset(args)
        self.data_dir = data_dir

    def get_train_examples(self, data_dir):
        return self.dataset.file_reader(self.data_dir, 65, mode="train")

    def get_dev_examples(self, data_dir):
        return self.dataset.file_reader(self.data_dir, 65, mode="dev")

    def get_test_examples(self, data_dir):
        return self.dataset.file_reader(self.data_dir, 65, mode="test")

    def data_generator(self, mode='train', epoch=1, shuffle=True):
        if mode == "train":
            return paddle.batch(
                self.get_train_examples(self.data_dir), 300, drop_last=True)
        elif mode == "dev":
            return paddle.batch(
                self.get_dev_examples(self.data_dir), 300, drop_last=True)
        elif mode == "infer":
            return paddle.batch(
                self.get_test_examples(self.data_dir), 300, drop_last=True)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument(
        "--word_dict_path",
        type=str,
        default="./conf/word.dic",
        help="word dict")
    parser.add_argument(
        "--label_dict_path",
        type=str,
        default="./conf/tag.dic",
        help="label dict")
    parser.add_argument(
        "--word_rep_dict_path",
        type=str,
        default="./conf/q2b.dic",
        help="word replace dict")
    args = parser.parse_args()
    dataset = Dataset(args)
    processor = LACProcessor(args, "data/train.tsv", args.word_dict_path)
    for data in processor.data_generator("train")():
        for xx in data:
            print(xx)
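
For reference, `file_reader` expects a header line `text_a<TAB>label`, then one example per line with the tokens of the sentence and their tags each joined by the \002 control character. A hypothetical two-line toy file, written programmatically so the separators are explicit (the tag names here are illustrative, not the real LAC tag set):

    import io

    SEP = "\002"  # separator between tokens and between tags

    with io.open("toy_train.tsv", "w", encoding="utf-8") as f:
        f.write(u"text_a\tlabel\n")
        f.write(SEP.join([u"百", u"度", u"是", u"公", u"司"]) + u"\t" +
                SEP.join([u"ORG-B", u"ORG-I", u"v-B", u"n-B", u"n-I"]) +
                u"\n")

The assert in `file_reader` requires the token count and tag count per line to match, as they do here (five of each).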
dygraph/lexical_analysis/utils.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
util tools
"""
from __future__ import print_function

import os
import sys
import io

import numpy as np
import paddle.fluid as fluid
import yaml


def str2bool(v):
    """
    argparse does not support True or False in python
    """
    return v.lower() in ("true", "t", "1")


class ArgumentGroup(object):
    """
    Put arguments into one group
    """

    def __init__(self, parser, title, des):
        """none"""
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self, name, type, default, help, **kwargs):
        """Add argument"""
        type = str2bool if type == bool else type
        self._group.add_argument(
            "--" + name,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
            **kwargs)


def load_yaml(parser, file_name, **kwargs):
    with io.open(file_name, 'r', encoding='utf8') as f:
        args = yaml.load(f)
        for title in args:
            group = parser.add_argument_group(title=title, description='')
            for name in args[title]:
                # infer the argparse type from the YAML default value
                _type = type(args[title][name]['val'])
                _type = str2bool if _type == bool else _type
                group.add_argument(
                    "--" + name,
                    default=args[title][name]['val'],
                    type=_type,
                    help=args[title][name]['meaning'] +
                    ' Default: %(default)s.',
                    **kwargs)


def print_arguments(args):
    """none"""
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
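
Taken together, this is how main.py is expected to drive these helpers; a minimal sketch assuming args.yaml sits in the working directory (`print_arguments` is defined above but not yet called anywhere in this commit):

    import argparse
    import utils

    parser = argparse.ArgumentParser(__doc__)
    utils.load_yaml(parser, 'args.yaml')   # one flag per YAML entry
    args = parser.parse_args()
    utils.print_arguments(args)            # dump the resolved configuration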