Commit 78489c4b
Authored Mar 14, 2019 by tianxin04
Parent: 354d97a8

format

Showing 9 changed files with 199 additions and 129 deletions (+199 -129)
ERNIE/batching.py               +16  -3
ERNIE/finetune/classifier.py    +42  -37
ERNIE/finetune_args.py           +2  -1
ERNIE/pretrain_args.py           +0  -1
ERNIE/reader/pretraining.py     +11  -7
ERNIE/reader/task_reader.py     +63  -29
ERNIE/run_classifier.py         +22  -15
ERNIE/run_sequence_labeling.py  +31  -25
ERNIE/train.py                  +12  -11
ERNIE/batching.py
@@ -19,7 +19,15 @@ from __future__ import print_function
import numpy as np


def mask(batch_tokens,
         seg_labels,
         mask_word_tags,
         total_token_num,
         vocab_size,
         CLS=1,
         SEP=2,
         MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos;
    Note: mask_pos responding the batch_tokens after padded;
@@ -90,7 +98,8 @@ def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size,
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index +
                                                    token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
                else:
@@ -143,7 +152,10 @@ def prepare_batch_data(insts,
    pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
    sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)

    return_list = [
        src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels,
        next_sent_index
    ]

    return return_list
@@ -207,4 +219,5 @@ def pad_batch_data(insts,
if __name__ == "__main__":
    pass
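The pad_batch_data calls above turn variable-length id lists into rectangular arrays. A minimal illustrative sketch of that padding step (not the repository's implementation; the name pad_to_batch is hypothetical):

import numpy as np

def pad_to_batch(seqs, pad_idx=0):
    """Right-pad a list of variable-length id lists to a [batch, max_len] array."""
    max_len = max(len(s) for s in seqs)
    out = np.full((len(seqs), max_len), pad_idx, dtype="int64")
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

# e.g. pad_to_batch([[5, 2, 9], [7]], pad_idx=0) -> [[5, 2, 9], [7, 0, 0]]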
ERNIE/finetune/classifier.py
@@ -25,22 +25,20 @@ import paddle.fluid as fluid
from model.ernie import ErnieModel


def create_model(args, pyreader_name, ernie_config, is_prediction=False):
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                [-1, args.max_seq_len, 1],
                [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
                [-1, 1]],
        dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
        lod_levels=[0, 0, 0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)

    (src_ids, sent_ids, pos_ids, self_attn_mask, labels, next_sent_index,
     qids) = fluid.layers.read_file(pyreader)

    ernie = ErnieModel(
        src_ids=src_ids,
@@ -57,7 +55,7 @@ def create_model(args,
            dropout_implementation="upscale_in_train")

    logits = fluid.layers.fc(
        input=cls_feats,
-       size=ernie_config["num_labels"],
+       size=args.num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
@@ -82,18 +80,21 @@ def create_model(args,
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    graph_vars = {
        "loss": loss,
        "probs": probs,
        "accuracy": accuracy,
        "labels": labels,
        "num_seqs": num_seqs,
        "qids": qids
    }

    for k, v in graph_vars.items():
        v.persistable = True

    return pyreader, graph_vars


def evaluate_mrr(preds):
    last_qid = None
    total_mrr = 0.0
@@ -114,6 +115,7 @@ def evaluate_mrr(preds):
    return total_mrr / qnum


def evaluate_map(preds):
    def singe_map(st, en):
        total_p = 0.0
@@ -142,17 +144,18 @@ def evaluate_map(preds):
        total_map += singe_map(st, len(preds))
    return total_map / qnum


def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
    train_fetch_list = [
        graph_vars["loss"].name, graph_vars["accuracy"].name,
        graph_vars["num_seqs"].name
    ]

    if eval_phase == "train":
        if "learning_rate" in graph_vars:
            train_fetch_list.append(graph_vars["learning_rate"].name)
        outputs = exe.run(fetch_list=train_fetch_list)
        ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
        if "learning_rate" in graph_vars:
            ret["learning_rate"] = float(outputs[4][0])
        return ret
@@ -162,22 +165,21 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
    qids, labels, scores = [], [], []
    time_begin = time.time()

    fetch_list = [
        graph_vars["loss"].name, graph_vars["accuracy"].name,
        graph_vars["probs"].name, graph_vars["labels"].name,
        graph_vars["num_seqs"].name, graph_vars["qids"].name
    ]
    while True:
        try:
            np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
                program=test_program, fetch_list=fetch_list)
            total_cost += np.sum(np_loss * np_num_seqs)
            total_acc += np.sum(np_acc * np_num_seqs)
            total_num_seqs += np.sum(np_num_seqs)
            labels.extend(np_labels.reshape((-1)).tolist())
            qids.extend(np_qids.reshape(-1).tolist())
            scores.extend(np_probs[:, 1].reshape(-1).tolist())
            np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
            total_label_pos_num += np.sum(np_labels)
            total_pred_pos_num += np.sum(np_preds)
@@ -188,20 +190,23 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
    time_end = time.time()

    if len(qids) == 0:
        print(
            "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s"
            % (eval_phase, total_cost / total_num_seqs,
               total_acc / total_num_seqs, total_num_seqs,
               time_end - time_begin))
    else:
        r = total_correct_num / total_label_pos_num
        p = total_correct_num / total_pred_pos_num
        f = 2 * p * r / (p + r)
        assert len(qids) == len(labels) == len(scores)
        preds = sorted(
            zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
        mrr = evaluate_mrr(preds)
        map = evaluate_map(preds)
        print(
            "[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s"
            % (eval_phase, total_cost / total_num_seqs,
               total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs,
               time_end - time_begin))
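evaluate() sorts its (qid, score, label) triples by qid and descending score before handing them to evaluate_mrr and evaluate_map. A rough stand-alone sketch of MRR over triples in that order, assuming binary labels (illustrative, not this file's exact implementation):

def mrr(preds):
    """preds: (qid, score, label) triples, pre-sorted by (qid, -score)."""
    total, qnum, rank, last_qid, hit = 0.0, 0, 0, None, False
    for qid, _, label in preds:
        if qid != last_qid:            # a new query begins
            qnum += 1
            rank, hit, last_qid = 0, False, qid
        rank += 1
        if label == 1 and not hit:     # first relevant result for this query
            total += 1.0 / rank
            hit = True
    return total / qnum if qnum else 0.0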
ERNIE/finetune_args.py
@@ -64,7 +64,7 @@ data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models."
)
data_g
.
add_arg
(
"random_seed"
,
int
,
0
,
"Random seed."
)
data_g
.
add_arg
(
"label_map_config"
,
str
,
None
,
"label_map_path."
)
data_g
.
add_arg
(
"num_labels"
,
int
,
2
,
"label number"
)
data_g
.
add_arg
(
"num_labels"
,
int
,
2
,
"label number"
)
run_type_g
=
ArgumentGroup
(
parser
,
"run_type"
,
"running type options."
)
run_type_g
.
add_arg
(
"use_cuda"
,
bool
,
True
,
"If set, use GPU for training."
)
...
...
@@ -74,3 +74,4 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val",  bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
+# yapf: enable
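ArgumentGroup here is the repository's thin wrapper over argparse (defined in utils/args.py); the new num_labels flag behaves roughly like the following plain-argparse equivalent (a sketch of the effect, not the wrapper's actual code):

import argparse

parser = argparse.ArgumentParser(__doc__)
data_g = parser.add_argument_group("data", "data paths and options")
data_g.add_argument("--num_labels", type=int, default=2, help="label number")

args = parser.parse_args(["--num_labels", "3"])
assert args.num_labels == 3  # consumed by the classifier head in finetune/classifier.py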
ERNIE/pretrain_args.py
@@ -24,7 +24,6 @@ from utils.args import ArgumentGroup, print_arguments
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
-parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint",   str, None, "Init checkpoint to resume training from.")
ERNIE/reader/pretraining.py
@@ -30,6 +30,7 @@ import paddle.fluid as fluid
from batching import prepare_batch_data


class ErnieDataReader(object):
    def __init__(self,
                 filelist,
@@ -81,8 +82,8 @@ class ErnieDataReader(object):
        sent_ids = [int(token) for token in sent_ids.split(" ")]
        pos_ids = [int(token) for token in pos_ids.split(" ")]
        seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")]
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            seg_labels
        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
        label = int(label)
        if len(token_ids) > max_seq_len:
@@ -153,14 +154,17 @@ class ErnieDataReader(object):
            if left_len <= max_len:
                return (token_seq[1:sep_index], seg_labels[1:sep_index])
            else:
                return [
                    token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1]
                ]

        for i in range(num_sample):
            pair_index = (i + 1) % num_sample
            left_tokens, left_seg_labels = split_sent(
                pos_samples[i], (self.max_seq_len - 3) // 2, self.sep_id)
            right_tokens, right_seg_labels = split_sent(
                pos_samples[pair_index],
                self.max_seq_len - 3 - len(left_tokens), self.sep_id)

            token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \
                    right_tokens + [self.sep_id]
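split_sent above budgets max_seq_len - 3 tokens for a sentence pair, reserving three positions for [CLS] and the two [SEP]s. A minimal sketch of that packing arithmetic (illustrative only; cls_id and sep_id stand in for the reader's actual vocabulary ids):

def pack_pair(left_tokens, right_tokens, max_seq_len, cls_id=101, sep_id=102):
    """Truncate a token pair so [CLS] a... [SEP] b... [SEP] fits in max_seq_len."""
    budget = max_seq_len - 3                   # room left after special tokens
    left = left_tokens[:budget // 2]           # left side gets half the budget
    right = right_tokens[:budget - len(left)]  # right side gets the remainder
    return [cls_id] + left + [sep_id] + right + [sep_id]

seq = pack_pair(list(range(10)), list(range(10)), max_seq_len=16)
assert len(seq) <= 16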
ERNIE/reader/task_reader.py
@@ -62,7 +62,7 @@ class BaseReader(object):
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            headers = next(reader)
            Example = namedtuple('Example', headers)
            examples = []
            for line in reader:
                example = Example(*line)
@@ -85,7 +85,7 @@ class BaseReader(object):
            else:
                tokens_b.pop()

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """Converts a single `Example` into a single `Record`."""
        text_a = tokenization.convert_to_unicode(example.text_a)
@@ -148,7 +148,9 @@ class BaseReader(object):
        else:
            label_id = example.label

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])

        qid = None
        if "qid" in example._fields:
@@ -164,11 +166,12 @@ class BaseReader(object):
    def _prepare_batch_data(self, examples, batch_size, phase=None):
        """generate batch records"""
        batch_records, max_len = [], 0
        for index, example in enumerate(examples):
            if phase == "train":
                self.current_example = index
            record = self._convert_example_to_record(example, self.max_seq_len,
                                                     self.tokenizer)
            max_len = max(max_len, len(record.token_ids))
            if self.in_tokens:
                to_append = (len(batch_records) + 1) * max_len <= batch_size
@@ -187,7 +190,12 @@ class BaseReader(object):
        examples = self._read_tsv(input_file)
        return len(examples)

    def data_generator(self,
                       input_file,
                       batch_size,
                       epoch,
                       shuffle=True,
                       phase=None):
        examples = self._read_tsv(input_file)

        def wrapper():
@@ -198,8 +206,10 @@ class BaseReader(object):
                if shuffle:
                    np.random.shuffle(examples)
                for batch_data in self._prepare_batch_data(
                        examples, batch_size, phase=phase):
                    yield batch_data

        return wrapper
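data_generator returns a closure over the examples rather than the batches themselves, so each epoch can reshuffle in place. A stripped-down sketch of that pattern (hypothetical, with fixed-size batches instead of the reader's record-based ones):

import random

def data_generator(examples, batch_size, epoch, shuffle=True):
    """Return a callable that yields batches for `epoch` passes over the data."""
    def wrapper():
        for _ in range(epoch):
            if shuffle:
                random.shuffle(examples)   # fresh order every epoch
            for i in range(0, len(examples), batch_size):
                yield examples[i:i + batch_size]
    return wrapper

batches = list(data_generator(list(range(10)), batch_size=4, epoch=2)())
assert len(batches) == 6  # ceil(10/4) = 3 batches per epoch, 2 epochs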
@@ -209,9 +219,11 @@ class ClassifyReader(BaseReader):
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            headers = next(reader)
            text_indices = [
                index for index, h in enumerate(headers) if h != "label"
            ]
            Example = namedtuple('Example', headers)

            examples = []
            for line in reader:
                for index, text in enumerate(line):
@@ -219,8 +231,8 @@ class ClassifyReader(BaseReader):
                        line[index] = text.replace(' ', '')
                example = Example(*line)
                examples.append(example)

            return examples

    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -236,33 +248,50 @@ class ClassifyReader(BaseReader):
        # padding
        padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_next_sent_pos=True,
            return_attn_bias=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            self_attn_bias, batch_labels, next_sent_index, batch_qids
        ]

        return return_list


class SequenceLabelReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]
        batch_seq_lens = [len(record.token_ids) for record in batch_records]

        # padding
        padded_token_ids, self_attn_bias = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_next_sent_pos=False,
            return_attn_bias=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_label_ids = pad_batch_data(
            batch_label_ids, pad_idx=len(self.label_map) - 1)
        batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape(
            [-1, 1])

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            self_attn_bias, padded_label_ids, batch_seq_lens
        ]

        return return_list

    def _reseg_token_label(self, tokens, labels, tokenizer):
@@ -285,7 +314,7 @@ class SequenceLabelReader(BaseReader):
        assert len(ret_tokens) == len(ret_labels)
        return ret_tokens, ret_labels

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
        labels = tokenization.convert_to_unicode(example.label).split(u"")
        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
@@ -297,11 +326,15 @@ class SequenceLabelReader(BaseReader):
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])

        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
@@ -309,5 +342,6 @@ class SequenceLabelReader(BaseReader):
            label_ids=label_ids)
        return record


if __name__ == '__main__':
    pass
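_prepare_batch_data's in_tokens mode caps each batch by padded token count rather than example count, via the predicate (len(batch_records) + 1) * max_len <= batch_size. A toy sketch of that rule over sequence lengths (hypothetical names, illustrative only):

def batch_by_tokens(lengths, batch_size):
    """Group sequence lengths so batch_count * running_max_len <= batch_size."""
    batches, current, max_len = [], [], 0
    for n in lengths:
        max_len = max(max_len, n)
        if (len(current) + 1) * max_len <= batch_size:
            current.append(n)
        else:
            batches.append(current)          # flush the current batch
            current, max_len = [n], n        # start a new one with this sequence
    if current:
        batches.append(current)
    return batches

# batch_by_tokens([3, 4, 10, 2], batch_size=12) -> [[3, 4], [10], [2]]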
ERNIE/run_classifier.py
@@ -32,11 +32,11 @@ from finetune.classifier import create_model, evaluate
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint
from finetune_args import parser

args = parser.parse_args()


def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()
@@ -49,12 +49,13 @@ def main(args):
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
@@ -108,10 +109,11 @@ def main(args):
        fluid.memory_optimize(
            input_program=train_program,
            skip_opt_set=[
                graph_vars["loss"].name,
                graph_vars["probs"].name,
                graph_vars["accuracy"].name,
                graph_vars["num_seqs"].name,
            ])

    if args.verbose:
@@ -201,7 +203,8 @@ def main(args):
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, "train")

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
@@ -217,7 +220,8 @@ def main(args):
                    print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
                           steps, outputs["loss"], outputs["accuracy"],
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
@@ -254,7 +258,9 @@ def main(args):
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
@@ -273,4 +279,5 @@ def main(args):
if __name__ == '__main__':
    print_arguments(args)
    main(args)
ERNIE/run_sequence_labeling.py
@@ -30,12 +30,12 @@ from model.ernie import ErnieConfig
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
from utils.args import print_arguments
from finetune.sequence_label import create_model, evaluate
from finetune_args import parser

args = parser.parse_args()


def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()
@@ -48,12 +48,13 @@ def main(args):
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
@@ -107,10 +108,9 @@ def main(args):
        fluid.memory_optimize(
            input_program=train_program,
            skip_opt_set=[
                graph_vars["loss"].name, graph_vars["labels"].name,
                graph_vars["infers"].name, graph_vars["seq_lens"].name
            ])

    if args.verbose:
@@ -200,24 +200,26 @@ def main(args):
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, args.num_labels, "train",
                                       dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["f1"],
                             outputs["precision"], outputs["recall"],
                             args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
@@ -234,7 +236,8 @@ def main(args):
                            batch_size=args.batch_size,
                            epoch=1,
                            shuffle=False))
                    evaluate(exe, test_prog, test_pyreader, graph_vars,
                             args.num_labels, "dev")

                # evaluate test set
                if args.do_test:
                    test_pyreader.decorate_tensor_provider(
@@ -243,7 +246,8 @@ def main(args):
                            batch_size=args.batch_size,
                            epoch=1,
                            shuffle=False))
                    evaluate(exe, test_prog, test_pyreader, graph_vars,
                             args.num_labels, "test")

        except fluid.core.EOFException:
            save_path = os.path.join(args.checkpoints, "step_" + str(steps))
@@ -255,7 +259,9 @@ def main(args):
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
ERNIE/train.py
@@ -35,8 +35,10 @@ from utils.init import init_checkpoint, init_pretraining_params
from pretrain_args import parser

args = parser.parse_args()
# yapf: enable.


def create_model(pyreader_name, ernie_config):
    pyreader = fluid.layers.py_reader(
        capacity=70,
@@ -224,8 +226,7 @@ def train(args):
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
@@ -319,13 +320,14 @@ def train(args):
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file, mask_type))
                cost = []
                lm_cost = []
                acc = []
@@ -341,8 +343,7 @@ def train(args):
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
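The validation log prints two perplexities: a "global" one, exp of the mean LM cost, and a "batch-averaged" one, the mean of per-batch exp(cost). The two differ whenever costs vary across batches (Jensen's inequality); a small numpy illustration with made-up costs:

import numpy as np

vali_lm_cost = np.array([2.0, 4.0])           # per-batch average LM losses
global_ppl = np.exp(np.mean(vali_lm_cost))    # exp(3.0)        ~= 20.09
batch_avg_ppl = np.mean(np.exp(vali_lm_cost)) # (e^2 + e^4) / 2 ~= 31.00
assert batch_avg_ppl >= global_ppl            # Jensen: mean of exp >= exp of mean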