Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
hapi
提交
15aef487
H
hapi
项目概览
PaddlePaddle
/
hapi
通知
11
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
4
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
H
hapi
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
4
Issue
4
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
15aef487
编写于
4月 26, 2020
作者:
G
Guo Sheng
提交者:
GitHub
4月 26, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #54 from 0YuanZhang0/seq_tag
seq_tag
上级
2004b003
a14ade8d
变更
10
显示空白变更内容
内联
并排
Showing
10 changed file
with
308 addition
and
307 deletion
+308
-307
examples/sequence_tagging/README.md
examples/sequence_tagging/README.md
+14
-17
examples/sequence_tagging/downloads.py
examples/sequence_tagging/downloads.py
+1
-1
examples/sequence_tagging/eval.py
examples/sequence_tagging/eval.py
+19
-39
examples/sequence_tagging/predict.py
examples/sequence_tagging/predict.py
+17
-26
examples/sequence_tagging/reader.py
examples/sequence_tagging/reader.py
+126
-135
examples/sequence_tagging/sequence_tagging.yaml
examples/sequence_tagging/sequence_tagging.yaml
+2
-3
examples/sequence_tagging/train.py
examples/sequence_tagging/train.py
+34
-38
examples/sequence_tagging/utils/configure.py
examples/sequence_tagging/utils/configure.py
+11
-5
examples/sequence_tagging/utils/metrics.py
examples/sequence_tagging/utils/metrics.py
+16
-17
hapi/text/text.py
hapi/text/text.py
+68
-26
未找到文件。
examples/sequence_tagging/README.md
浏览文件 @
15aef487
...
...
@@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词
|模型|Precision|Recall|F1-score|
|:-:|:-:|:-:|:-:|
|Lexical Analysis|8
8.26%|89.20%|88.73
%|
|Lexical Analysis|8
9.57%|89.96%|89.76
%|
## 2. 快速开始
...
...
@@ -22,7 +22,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词
克隆工具集代码库到本地
```
bash
git clone https://github.com/PaddlePaddle/hapi.git
cd
hapi/sequence_tagging
cd
hapi/
examples/
sequence_tagging
```
#### 3. 环境依赖
...
...
@@ -70,7 +70,7 @@ python -u train.py \
--dynamic False
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
True, 动态图设置为Fals
e
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
False, 动态图设置为Tru
e
```
GPU上多卡训练
...
...
@@ -84,7 +84,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \
--dynamic False
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
True, 动态图设置为Fals
e
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
False, 动态图设置为Tru
e
```
CPU上训练
...
...
@@ -95,7 +95,7 @@ python -u train.py \
--dynamic False
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
True, 动态图设置为Fals
e
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为
False, 动态图设置为Tru
e
```
### 模型预测
...
...
@@ -105,15 +105,13 @@ python -u train.py \
python predict.py
\
--init_from_checkpoint
model_baseline/params
\
--output_file
predict.result
\
--mode
predict
\
--device
cpu
\
--dynamic
False
# --init_from_checkpoint: 初始化模型
# --output_file: 预测结果文件
# --device: 使用gpu还是cpu设备
# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True
```
### 模型评估
...
...
@@ -123,14 +121,12 @@ python predict.py \
# baseline model
python eval.py
\
--init_from_checkpoint
./model_baseline/params
\
--mode
predict
\
--device
cpu
\
--dynamic
False
# --init_from_checkpoint: 初始化模型
# --device: 使用gpu还是cpu设备
# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为False, 动态图设置为True
```
...
...
@@ -196,6 +192,7 @@ Overall Architecture of GRU-CRF-MODEL
├── eval.py # 词法分析评估的脚本
├── downloads.py # 用于下载数据和模型的脚本
├── downloads.sh # 用于下载数据和模型的脚本
├── sequence_tagging.yaml # 模型训练、预测、评估相关配置参数
└──reader.py # 文件读取相关函数
```
...
...
examples/sequence_tagging/downloads.py
浏览文件 @
15aef487
...
...
@@ -35,7 +35,7 @@ FILE_INFO = {
},
'MODEL'
:
{
'name'
:
'sequence_tagging_dy.tar.gz'
,
'md5'
:
"
1125d374c03c8218b6e47325dcf607e3
"
'md5'
:
"
6ba37ceea8f1f764ba1fe227295a6a3b
"
},
}
...
...
examples/sequence_tagging/eval.py
浏览文件 @
15aef487
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging
network
structure
SequenceTagging
eval
structure
"""
from
__future__
import
division
...
...
@@ -25,18 +25,16 @@ import math
import
argparse
import
numpy
as
np
from
train
import
SeqTagging
from
train
import
SeqTagging
,
ChunkEval
,
LacLoss
from
utils.configure
import
PDConfig
from
utils.check
import
check_gpu
,
check_version
from
utils.metrics
import
chunk_count
from
reader
import
LacDataset
,
create_lexnet_data_generator
,
create_dataloader
from
reader
import
LacDataset
,
LacDataLoader
work_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
sys
.
path
.
append
(
os
.
path
.
join
(
work_dir
,
"../"
))
from
hapi.model
import
set_device
,
Input
import
paddle.fluid
as
fluid
from
paddle.fluid.optimizer
import
AdamOptimizer
from
paddle.fluid.layers.utils
import
flatten
...
...
@@ -44,48 +42,30 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
)]
inputs
=
[
Input
(
[
None
,
None
],
'int64'
,
name
=
'words'
),
Input
(
[
None
],
'int64'
,
name
=
'length'
),
Input
(
[
None
,
None
],
'int64'
,
name
=
'target'
)
]
labels
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'labels'
)]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
]
dataset
=
LacDataset
(
args
)
eval_path
=
args
.
test_file
chunk_evaluator
=
fluid
.
metrics
.
ChunkEvaluator
()
chunk_evaluator
.
reset
()
eval_generator
=
create_lexnet_data_generator
(
args
,
reader
=
dataset
,
file_name
=
eval_path
,
place
=
place
,
mode
=
"test"
)
eval_dataset
=
create_dataloader
(
eval_generator
,
place
,
feed_list
=
feed_list
)
eval_dataset
=
LacDataLoader
(
args
,
place
,
phase
=
"test"
)
vocab_size
=
dataset
.
vocab_size
num_labels
=
dataset
.
num_labels
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
)
optim
=
AdamOptimizer
(
learning_rate
=
args
.
base_learning_rate
,
parameter_list
=
model
.
parameters
())
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
,
mode
=
"test"
)
model
.
mode
=
"test"
model
.
prepare
(
inputs
=
inputs
)
model
.
prepare
(
metrics
=
ChunkEval
(
num_labels
),
inputs
=
inputs
,
labels
=
labels
,
device
=
place
)
model
.
load
(
args
.
init_from_checkpoint
,
skip_mismatch
=
True
)
for
data
in
eval_dataset
():
if
len
(
data
)
==
1
:
batch_data
=
data
[
0
]
targets
=
np
.
array
(
batch_data
[
2
])
else
:
batch_data
=
data
targets
=
batch_data
[
2
].
numpy
()
inputs_data
=
[
batch_data
[
0
],
batch_data
[
1
]]
crf_decode
,
length
=
model
.
test
(
inputs
=
inputs_data
)
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
=
chunk_count
(
crf_decode
,
targets
,
length
,
dataset
.
id2label_dict
)
chunk_evaluator
.
update
(
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
precision
,
recall
,
f1
=
chunk_evaluator
.
eval
()
print
(
"[test] P: %.5f, R: %.5f, F1: %.5f"
%
(
precision
,
recall
,
f1
))
model
.
evaluate
(
eval_dataset
.
dataloader
,
batch_size
=
args
.
batch_size
)
if
__name__
==
'__main__'
:
...
...
examples/sequence_tagging/predict.py
浏览文件 @
15aef487
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging
network
structure
SequenceTagging
predict
structure
"""
from
__future__
import
division
...
...
@@ -28,14 +28,13 @@ import numpy as np
from
train
import
SeqTagging
from
utils.check
import
check_gpu
,
check_version
from
utils.configure
import
PDConfig
from
reader
import
LacDataset
,
create_lexnet_data_generator
,
create_datal
oader
from
reader
import
LacDataset
,
LacDataL
oader
work_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
sys
.
path
.
append
(
os
.
path
.
join
(
work_dir
,
"../"
))
from
hapi.model
import
set_device
,
Input
import
paddle.fluid
as
fluid
from
paddle.fluid.optimizer
import
AdamOptimizer
from
paddle.fluid.layers.utils
import
flatten
...
...
@@ -43,26 +42,18 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
)]
inputs
=
[
Input
(
[
None
,
None
],
'int64'
,
name
=
'words'
),
Input
(
[
None
],
'int64'
,
name
=
'length'
)
]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
]
dataset
=
LacDataset
(
args
)
predict_path
=
args
.
predict_file
predict_generator
=
create_lexnet_data_generator
(
args
,
reader
=
dataset
,
file_name
=
predict_path
,
place
=
place
,
mode
=
"predict"
)
predict_dataset
=
create_dataloader
(
predict_generator
,
place
,
feed_list
=
feed_list
)
predict_dataset
=
LacDataLoader
(
args
,
place
,
phase
=
"predict"
)
vocab_size
=
dataset
.
vocab_size
num_labels
=
dataset
.
num_labels
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
)
optim
=
AdamOptimizer
(
learning_rate
=
args
.
base_learning_rate
,
parameter_list
=
model
.
parameters
())
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
,
mode
=
"predict"
)
model
.
mode
=
"test"
model
.
prepare
(
inputs
=
inputs
)
...
...
@@ -70,15 +61,15 @@ def main(args):
model
.
load
(
args
.
init_from_checkpoint
,
skip_mismatch
=
True
)
f
=
open
(
args
.
output_file
,
"wb"
)
for
data
in
predict_dataset
():
for
data
in
predict_dataset
.
dataloader
:
if
len
(
data
)
==
1
:
input_data
=
data
[
0
]
else
:
input_data
=
data
results
,
length
=
model
.
test
(
inputs
=
flatten
(
input_data
))
results
,
length
=
model
.
test
_batch
(
inputs
=
flatten
(
input_data
))
for
i
in
range
(
len
(
results
)):
word_len
=
length
[
i
]
word_ids
=
results
[
i
][:
word_len
]
word_ids
=
results
[
i
][:
word_len
]
tags
=
[
dataset
.
id2label_dict
[
str
(
id
)]
for
id
in
word_ids
]
f
.
write
(
"
\002
"
.
join
(
tags
)
+
"
\n
"
)
...
...
examples/sequence_tagging/reader.py
浏览文件 @
15aef487
...
...
@@ -19,12 +19,19 @@ from __future__ import division
from
__future__
import
print_function
import
io
import
os
import
leveldb
import
numpy
as
np
import
shutil
from
functools
import
partial
import
paddle
from
paddle.io
import
BatchSampler
,
DataLoader
,
Dataset
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
from
hapi.distributed
import
DistributedBatchSampler
class
LacDataset
(
objec
t
):
class
LacDataset
(
Datase
t
):
"""
Load lexical analysis dataset
"""
...
...
@@ -34,6 +41,7 @@ class LacDataset(object):
self
.
label_dict_path
=
args
.
label_dict_path
self
.
word_rep_dict_path
=
args
.
word_rep_dict_path
self
.
_load_dict
()
self
.
examples
=
[]
def
_load_dict
(
self
):
self
.
word2id_dict
=
self
.
load_kv_dict
(
...
...
@@ -108,152 +116,135 @@ class LacDataset(object):
label_ids
.
append
(
label_id
)
return
label_ids
def
file_reader
(
self
,
filename
,
mode
=
"train"
,
batch_size
=
32
,
max_seq_len
=
126
):
def
file_reader
(
self
,
filename
,
phase
=
"train"
):
"""
yield (word_idx, target_idx) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def
wrapper
():
fread
=
io
.
open
(
filename
,
"r"
,
encoding
=
"utf-8"
)
if
mode
==
"train"
:
headline
=
next
(
fread
)
self
.
phase
=
phase
with
io
.
open
(
filename
,
"r"
,
encoding
=
"utf8"
)
as
fr
:
if
phase
in
[
"train"
,
"test"
]:
headline
=
next
(
fr
)
headline
=
headline
.
strip
().
split
(
'
\t
'
)
assert
len
(
headline
)
==
2
and
headline
[
0
]
==
"text_a"
and
headline
[
1
]
==
"label"
buf
=
[]
for
line
in
fread
:
words
,
labels
=
line
.
strip
(
"
\n
"
).
split
(
"
\t
"
)
if
len
(
words
)
<
1
:
continue
word_ids
=
self
.
word_to_ids
(
words
.
split
(
"
\002
"
))
label_ids
=
self
.
label_to_ids
(
labels
.
split
(
"
\002
"
))
assert
len
(
word_ids
)
==
len
(
label_ids
)
words_len
=
np
.
int64
(
len
(
word_ids
))
assert
len
(
headline
)
==
2
and
headline
[
0
]
==
"text_a"
and
headline
[
1
]
==
"label"
word_ids
=
word_ids
[
0
:
max_seq_len
]
words_len
=
np
.
int64
(
len
(
word_ids
))
word_ids
+=
[
0
for
_
in
range
(
max_seq_len
-
words_len
)]
label_ids
=
label_ids
[
0
:
max_seq_len
]
label_ids
+=
[
0
for
_
in
range
(
max_seq_len
-
words_len
)]
assert
len
(
word_ids
)
==
len
(
label_ids
)
yield
word_ids
,
label_ids
,
words_len
elif
mode
==
"test"
:
headline
=
next
(
fread
)
headline
=
headline
.
strip
().
split
(
'
\t
'
)
assert
len
(
headline
)
==
2
and
headline
[
0
]
==
"text_a"
and
headline
[
1
]
==
"label"
buf
=
[]
for
line
in
fread
:
words
,
labels
=
line
.
strip
(
"
\n
"
).
split
(
"
\t
"
)
if
len
(
words
)
<
1
:
for
line
in
fr
:
line_str
=
line
.
strip
(
"
\n
"
)
if
len
(
line_str
)
<
1
and
len
(
line_str
.
split
(
'
\t
'
))
<
2
:
continue
self
.
examples
.
append
(
line_str
)
else
:
for
idx
,
line
in
enumerate
(
fr
):
words
=
line
.
strip
(
"
\n
"
).
split
(
"
\t
"
)[
0
]
self
.
examples
.
append
(
words
)
def
__getitem__
(
self
,
idx
):
line_str
=
self
.
examples
[
idx
]
if
self
.
phase
in
[
"train"
,
"test"
]:
words
,
labels
=
line_str
.
split
(
'
\t
'
)
word_ids
=
self
.
word_to_ids
(
words
.
split
(
"
\002
"
))
label_ids
=
self
.
label_to_ids
(
labels
.
split
(
"
\002
"
))
assert
len
(
word_ids
)
==
len
(
label_ids
)
words_len
=
np
.
int64
(
len
(
word_ids
))
yield
word_ids
,
label_ids
,
words_len
return
word_ids
,
label_ids
else
:
for
line
in
fread
:
words
=
line
.
strip
(
"
\n
"
).
split
(
'
\t
'
)[
0
]
if
words
==
u
"text_a"
:
continue
if
"
\002
"
not
in
words
:
words
=
[
w
for
w
in
line_str
]
word_ids
=
self
.
word_to_ids
(
words
)
else
:
word_ids
=
self
.
word_to_ids
(
words
.
split
(
"
\002
"
))
words_len
=
np
.
int64
(
len
(
word_ids
))
yield
word_ids
,
words_len
return
word_ids
fread
.
close
()
def
__len__
(
self
):
return
wrapper
return
len
(
self
.
examples
)
def
create_lexnet_data_generator
(
args
,
reader
,
file_name
,
place
,
mode
=
"train"
):
def
padding_data
(
max_len
,
batch_data
):
def
create_lexnet_data_generator
(
args
,
insts
,
phase
=
"train"
):
def
padding_data
(
max_len
,
batch_data
,
if_len
=
False
):
padding_batch_data
=
[]
padding_lens
=
[]
for
data
in
batch_data
:
data
=
data
[:
max_len
]
if
if_len
:
seq_len
=
np
.
int64
(
len
(
data
))
padding_lens
.
append
(
seq_len
)
data
+=
[
0
for
_
in
range
(
max_len
-
len
(
data
))]
padding_batch_data
.
append
(
data
)
return
padding_batch_data
def
wrapper
():
if
mode
==
"train"
:
batch_words
,
batch_labels
,
seq_lens
=
[],
[],
[]
for
epoch
in
xrange
(
args
.
epoch
):
for
instance
in
reader
.
file_reader
(
file_name
,
mode
,
max_seq_len
=
args
.
max_seq_len
)():
words
,
labels
,
words_len
=
instance
if
len
(
seq_lens
)
<
args
.
batch_size
:
batch_words
.
append
(
words
)
batch_labels
.
append
(
labels
)
seq_lens
.
append
(
words_len
)
if
len
(
seq_lens
)
==
args
.
batch_size
:
yield
batch_words
,
seq_lens
,
batch_labels
,
batch_labels
batch_words
,
batch_labels
,
seq_lens
=
[],
[],
[]
if
len
(
seq_lens
)
>
0
:
yield
batch_words
,
seq_lens
,
batch_labels
,
batch_labels
elif
mode
==
"test"
:
batch_words
,
batch_labels
,
seq_lens
,
max_len
=
[],
[],
[],
0
for
instance
in
reader
.
file_reader
(
file_name
,
mode
,
max_seq_len
=
args
.
max_seq_len
)():
words
,
labels
,
words_len
=
instance
max_len
=
words_len
if
words_len
>
max_len
else
max_len
if
len
(
seq_lens
)
<
args
.
batch_size
:
batch_words
.
append
(
words
)
seq_lens
.
append
(
words_len
)
batch_labels
.
append
(
labels
)
if
len
(
seq_lens
)
==
args
.
batch_size
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
padding_batch_labels
=
padding_data
(
max_len
,
batch_labels
)
yield
padding_batch_words
,
seq_lens
,
padding_batch_labels
,
padding_batch_labels
batch_words
,
batch_labels
,
seq_lens
,
max_len
=
[],
[],
[],
0
if
len
(
seq_lens
)
>
0
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
padding_batch_labels
=
padding_data
(
max_len
,
batch_labels
)
yield
padding_batch_words
,
seq_lens
,
padding_batch_labels
,
padding_batch_labels
if
if_len
:
return
np
.
array
(
padding_batch_data
),
np
.
array
(
padding_lens
)
else
:
batch_words
,
seq_lens
,
max_len
=
[],
[],
0
for
instance
in
reader
.
file_reader
(
file_name
,
mode
,
max_seq_len
=
args
.
max_seq_len
)():
words
,
words_len
=
instance
if
len
(
seq_lens
)
<
args
.
batch_size
:
batch_words
.
append
(
words
)
seq_lens
.
append
(
words_len
)
max_len
=
words_len
if
words_len
>
max_len
else
max_len
if
len
(
seq_lens
)
==
args
.
batch_size
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
yield
padding_batch_words
,
seq_lens
batch_words
,
seq_lens
,
max_len
=
[],
[],
0
if
len
(
seq_lens
)
>
0
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
yield
padding_batch_words
,
seq_lens
return
wrapper
def
create_dataloader
(
generator
,
place
,
feed_list
=
None
):
if
not
feed_list
:
data_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
50
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
)
return
np
.
array
(
padding_batch_data
)
if
phase
==
"train"
:
batch_words
=
[
inst
[
0
]
for
inst
in
insts
]
batch_labels
=
[
inst
[
1
]
for
inst
in
insts
]
padding_batch_words
,
padding_lens
=
padding_data
(
args
.
max_seq_len
,
batch_words
,
if_len
=
True
)
padding_batch_labels
=
padding_data
(
args
.
max_seq_len
,
batch_labels
)
return
[
padding_batch_words
,
padding_lens
,
padding_batch_labels
,
padding_batch_labels
]
elif
phase
==
"test"
:
batch_words
=
[
inst
[
0
]
for
inst
in
insts
]
seq_len
=
[
len
(
inst
[
0
])
for
inst
in
insts
]
max_seq_len
=
max
(
seq_len
)
batch_labels
=
[
inst
[
1
]
for
inst
in
insts
]
padding_batch_words
,
padding_lens
=
padding_data
(
max_seq_len
,
batch_words
,
if_len
=
True
)
padding_batch_labels
=
padding_data
(
max_seq_len
,
batch_labels
)
return
[
padding_batch_words
,
padding_lens
,
padding_batch_labels
,
padding_batch_labels
]
else
:
batch_words
=
insts
seq_len
=
[
len
(
inst
)
for
inst
in
insts
]
max_seq_len
=
max
(
seq_len
)
padding_batch_words
,
padding_lens
=
padding_data
(
max_seq_len
,
batch_words
,
if_len
=
True
)
return
[
padding_batch_words
,
padding_lens
]
class
LacDataLoader
(
object
):
def
__init__
(
self
,
args
,
place
,
phase
=
"train"
,
shuffle
=
False
,
num_workers
=
0
,
drop_last
=
False
):
assert
phase
in
[
"train"
,
"test"
,
"predict"
],
"phase should be in [train, test, predict], but get %s"
%
phase
if
phase
==
"train"
:
file_name
=
args
.
train_file
elif
phase
==
"test"
:
file_name
=
args
.
test_file
elif
phase
==
"predict"
:
file_name
=
args
.
predict_file
self
.
dataset
=
LacDataset
(
args
)
self
.
dataset
.
file_reader
(
file_name
,
phase
=
phase
)
if
phase
==
"train"
:
self
.
sampler
=
DistributedBatchSampler
(
dataset
=
self
.
dataset
,
batch_size
=
args
.
batch_size
,
shuffle
=
shuffle
,
drop_last
=
drop_last
)
else
:
data_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
feed_list
=
feed_list
,
capacity
=
50
,
use_double_buffer
=
True
,
iterable
=
True
,
self
.
sampler
=
BatchSampler
(
dataset
=
self
.
dataset
,
batch_size
=
args
.
batch_size
,
shuffle
=
shuffle
,
drop_last
=
drop_last
)
self
.
dataloader
=
DataLoader
(
dataset
=
self
.
dataset
,
batch_sampler
=
self
.
sampler
,
places
=
place
,
collate_fn
=
partial
(
create_lexnet_data_generator
,
args
,
phase
=
phase
),
num_workers
=
num_workers
,
return_list
=
True
)
data_loader
.
set_batch_generator
(
generator
,
places
=
place
)
return
data_loader
examples/sequence_tagging/sequence_tagging.yaml
浏览文件 @
15aef487
word_dict_path
:
"
./conf/word.dic"
label_dict_path
:
"
./conf/tag.dic"
word_rep_dict_path
:
"
./conf/q2b.dic"
device
:
"
c
pu"
device
:
"
g
pu"
dynamic
:
True
epoch
:
10
base_learning_rate
:
0.001
...
...
@@ -14,7 +14,7 @@ batch_size: 300
max_seq_len
:
126
num_devices
:
1
save_dir
:
"
model"
init_from_checkpoint
:
"
model_baseline/params
"
init_from_checkpoint
:
"
"
init_from_pretrain_model
:
"
"
save_freq
:
1
eval_freq
:
1
...
...
@@ -22,4 +22,3 @@ output_file: "predict.result"
test_file
:
"
./data/test.tsv"
train_file
:
"
./data/train.tsv"
predict_file
:
"
./data/infer.tsv"
mode
:
"
train"
examples/sequence_tagging/train.py
浏览文件 @
15aef487
...
...
@@ -28,21 +28,23 @@ import numpy as np
work_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
sys
.
path
.
append
(
os
.
path
.
join
(
work_dir
,
"../"
))
from
hapi.metrics
import
Metric
from
hapi.model
import
Model
,
Input
,
Loss
,
set_device
from
hapi.text.text
import
SequenceTagging
from
utils.check
import
check_gpu
,
check_version
from
utils.configure
import
PDConfig
from
reader
import
LacDataset
,
create_lexnet_data_generator
,
create_dataloader
from
reader
import
LacDataset
,
LacDataLoader
import
paddle.fluid
as
fluid
from
paddle.fluid.optimizer
import
AdamOptimizer
__all__
=
[
"SeqTagging"
,
"LacLoss"
,
"ChunkEval"
]
class
SeqTagging
(
Model
):
def
__init__
(
self
,
args
,
vocab_size
,
num_labels
,
length
=
None
):
def
__init__
(
self
,
args
,
vocab_size
,
num_labels
,
length
=
None
,
mode
=
"train"
):
super
(
SeqTagging
,
self
).
__init__
()
"""
define the lexical analysis network structure
...
...
@@ -53,7 +55,7 @@ class SeqTagging(Model):
for infer: return the prediction
otherwise: return the prediction
"""
self
.
mode_type
=
args
.
mode
self
.
mode_type
=
mode
self
.
word_emb_dim
=
args
.
word_emb_dim
self
.
vocab_size
=
vocab_size
self
.
num_labels
=
num_labels
...
...
@@ -65,7 +67,7 @@ class SeqTagging(Model):
self
.
bigru_num
=
args
.
bigru_num
self
.
batch_size
=
args
.
batch_size
self
.
init_bound
=
0.1
self
.
length
=
length
self
.
length
=
length
self
.
sequence_tagging
=
SequenceTagging
(
vocab_size
=
self
.
vocab_size
,
...
...
@@ -207,30 +209,25 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
),
Input
([
None
,
None
],
'int64'
,
name
=
'target'
)]
inputs
=
[
Input
(
[
None
,
None
],
'int64'
,
name
=
'words'
),
Input
(
[
None
],
'int64'
,
name
=
'length'
),
Input
(
[
None
,
None
],
'int64'
,
name
=
'target'
)
]
labels
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'labels'
)]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
+
labels
]
dataset
=
LacDataset
(
args
)
train_path
=
args
.
train_file
test_path
=
args
.
test_file
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
+
labels
]
train_generator
=
create_lexnet_data_generator
(
args
,
reader
=
dataset
,
file_name
=
train_path
,
place
=
place
,
mode
=
"train"
)
test_generator
=
create_lexnet_data_generator
(
args
,
reader
=
dataset
,
file_name
=
test_path
,
place
=
place
,
mode
=
"test"
)
train_dataset
=
create_dataloader
(
train_generator
,
place
,
feed_list
=
feed_list
)
test_dataset
=
create_dataloader
(
test_generator
,
place
,
feed_list
=
feed_list
)
dataset
=
LacDataset
(
args
)
train_dataset
=
LacDataLoader
(
args
,
place
,
phase
=
"train"
)
vocab_size
=
dataset
.
vocab_size
num_labels
=
dataset
.
num_labels
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
)
model
=
SeqTagging
(
args
,
vocab_size
,
num_labels
,
mode
=
"train"
)
optim
=
AdamOptimizer
(
learning_rate
=
args
.
base_learning_rate
,
...
...
@@ -250,8 +247,7 @@ def main(args):
if
args
.
init_from_pretrain_model
:
model
.
load
(
args
.
init_from_pretrain_model
,
reset_optimizer
=
True
)
model
.
fit
(
train_dataset
,
test_dataset
,
model
.
fit
(
train_dataset
.
dataloader
,
epochs
=
args
.
epoch
,
batch_size
=
args
.
batch_size
,
eval_freq
=
args
.
eval_freq
,
...
...
examples/sequence_tagging/utils/configure.py
浏览文件 @
15aef487
...
...
@@ -195,13 +195,19 @@ class PDConfig(object):
"Whether to perform predicting."
)
self
.
default_g
.
add_arg
(
"do_eval"
,
bool
,
False
,
"Whether to perform evaluating."
)
self
.
default_g
.
add_arg
(
"do_save_inference_model"
,
bool
,
False
,
self
.
default_g
.
add_arg
(
"do_save_inference_model"
,
bool
,
False
,
"Whether to perform model saving for inference."
)
# NOTE: args for profiler
self
.
default_g
.
add_arg
(
"is_profiler"
,
int
,
0
,
"the switch of profiler tools. (used for benchmark)"
)
self
.
default_g
.
add_arg
(
"profiler_path"
,
str
,
'./'
,
"the profiler output file path. (used for benchmark)"
)
self
.
default_g
.
add_arg
(
"max_iter"
,
int
,
0
,
"the max train batch num.(used for benchmark)"
)
self
.
default_g
.
add_arg
(
"is_profiler"
,
int
,
0
,
"the switch of profiler tools. (used for benchmark)"
)
self
.
default_g
.
add_arg
(
"profiler_path"
,
str
,
'./'
,
"the profiler output file path. (used for benchmark)"
)
self
.
default_g
.
add_arg
(
"max_iter"
,
int
,
0
,
"the max train batch num.(used for benchmark)"
)
self
.
parser
=
parser
...
...
examples/sequence_tagging/utils/metrics.py
浏览文件 @
15aef487
...
...
@@ -63,8 +63,8 @@ def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
assert
infer_numpy
.
shape
[
0
]
==
label_numpy
.
shape
[
0
]
for
i
in
range
(
infer_numpy
.
shape
[
0
]):
infer_list
=
infer_numpy
[
i
][:
seq_len
[
i
]]
label_list
=
label_numpy
[
i
][:
seq_len
[
i
]]
infer_list
=
infer_numpy
[
i
][:
seq_len
[
i
]]
label_list
=
label_numpy
[
i
][:
seq_len
[
i
]]
infer_dict
=
build_chunk
(
infer_list
,
id2label_dict
)
num_infer_chunks
+=
len
(
infer_dict
)
label_dict
=
build_chunk
(
label_list
,
id2label_dict
)
...
...
@@ -73,4 +73,3 @@ def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
if
key
in
label_dict
and
label_dict
[
key
]
==
infer_dict
[
key
]:
num_correct_chunks
+=
1
return
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
hapi/text/text.py
浏览文件 @
15aef487
...
...
@@ -49,7 +49,7 @@ __all__ = [
'BeamSearchDecoder'
,
'MultiHeadAttention'
,
'FFN'
,
'TransformerEncoderLayer'
,
'TransformerEncoder'
,
'TransformerDecoderLayer'
,
'TransformerDecoder'
,
'TransformerBeamSearchDecoder'
,
'Linear_chain_crf'
,
'Crf_decoding'
,
'SequenceTagging'
'Crf_decoding'
,
'SequenceTagging'
,
'GRUEncoderLayer'
]
...
...
@@ -763,7 +763,7 @@ class BasicGRUCell(RNNCell):
c
=
self
.
_activation
(
candidate
)
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
return
new_hidden
return
new_hidden
,
new_hidden
@
property
def
state_shape
(
self
):
...
...
@@ -1741,6 +1741,64 @@ class Crf_decoding(fluid.dygraph.Layer):
return
viterbi_path
class
GRUEncoderLayer
(
Layer
):
def
__init__
(
self
,
input_dim
,
grnn_hidden_dim
,
init_bound
,
num_layers
=
1
,
h_0
=
None
,
is_bidirection
=
False
):
super
(
GRUEncoderLayer
,
self
).
__init__
()
self
.
h_0
=
h_0
self
.
num_layers
=
num_layers
self
.
is_bidirection
=
is_bidirection
self
.
gru_list
=
[]
self
.
gru_r_list
=
[]
for
i
in
range
(
num_layers
):
self
.
basic_gru_cell
=
BasicGRUCell
(
input_size
=
input_dim
if
i
==
0
else
input_dim
*
2
,
hidden_size
=
grnn_hidden_dim
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
init_bound
,
high
=
init_bound
),
regularizer
=
fluid
.
regularizer
.
L2DecayRegularizer
(
regularization_coeff
=
1e-4
)))
self
.
gru_list
.
append
(
self
.
add_sublayer
(
"gru_%d"
%
i
,
RNN
(
self
.
basic_gru_cell
,
is_reverse
=
False
,
time_major
=
False
)))
if
self
.
is_bidirection
:
for
i
in
range
(
num_layers
):
self
.
basic_gru_cell_r
=
BasicGRUCell
(
input_size
=
input_dim
if
i
==
0
else
input_dim
*
2
,
hidden_size
=
grnn_hidden_dim
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
init_bound
,
high
=
init_bound
),
regularizer
=
fluid
.
regularizer
.
L2DecayRegularizer
(
regularization_coeff
=
1e-4
)))
self
.
gru_r_list
.
append
(
self
.
add_sublayer
(
"gru_r_%d"
%
i
,
RNN
(
self
.
basic_gru_cell_r
,
is_reverse
=
True
,
time_major
=
False
)))
def
forward
(
self
,
input_feature
):
for
i
in
range
(
self
.
num_layers
):
pre_gru
,
pre_state
=
self
.
gru_list
[
i
](
input_feature
)
if
self
.
is_bidirection
:
gru_r
,
r_state
=
self
.
gru_r_list
[
i
](
input_feature
)
out
=
fluid
.
layers
.
concat
(
input
=
[
pre_gru
,
gru_r
],
axis
=-
1
)
else
:
out
=
pre_gru
input_feature
=
out
return
out
class
SequenceTagging
(
fluid
.
dygraph
.
Layer
):
def
__init__
(
self
,
vocab_size
,
...
...
@@ -1790,26 +1848,13 @@ class SequenceTagging(fluid.dygraph.Layer):
force_cpu
=
True
,
name
=
'h_0'
)
self
.
bigru_units
=
[]
for
i
in
range
(
self
.
bigru_num
):
if
i
==
0
:
self
.
bigru_units
.
append
(
self
.
add_sublayer
(
"bigru_units%d"
%
i
,
BiGRU
(
self
.
grnn_hidden_dim
,
self
.
grnn_hidden_dim
,
self
.
init_bound
,
h_0
=
h_0
)))
else
:
self
.
bigru_units
.
append
(
self
.
add_sublayer
(
"bigru_units%d"
%
i
,
BiGRU
(
self
.
grnn_hidden_dim
*
2
,
self
.
grnn_hidden_dim
,
self
.
init_bound
,
h_0
=
h_0
)))
self
.
gru_encoder
=
GRUEncoderLayer
(
input_dim
=
self
.
grnn_hidden_dim
,
grnn_hidden_dim
=
self
.
grnn_hidden_dim
,
init_bound
=
self
.
init_bound
,
num_layers
=
self
.
bigru_num
,
h_0
=
h_0
,
is_bidirection
=
True
)
self
.
fc
=
Linear
(
input_dim
=
self
.
grnn_hidden_dim
*
2
,
...
...
@@ -1837,10 +1882,7 @@ class SequenceTagging(fluid.dygraph.Layer):
word_embed
=
self
.
word_embedding
(
word
)
input_feature
=
word_embed
for
i
in
range
(
self
.
bigru_num
):
bigru_output
=
self
.
bigru_units
[
i
](
input_feature
)
input_feature
=
bigru_output
bigru_output
=
self
.
gru_encoder
(
input_feature
)
emission
=
self
.
fc
(
bigru_output
)
if
target
is
not
None
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录