PaddlePaddle / ERNIE

Commit f889492f
Authored Nov 20, 2019 by chenxuyi

examples compat to ERNIE tiny

Parent: 72e21235
Showing 4 changed files with 75 additions and 21 deletions (+75 -21):

    ernie/utils/data.py               +44  -2
    example/finetune_classifier.py    +10  -3
    example/finetune_ner.py           +15  -14
    example/finetune_ranker.py        +6   -2
ernie/utils/data.py

@@ -4,6 +4,7 @@ import re
 from propeller import log
 import itertools
 from propeller.paddle.data import Dataset
 import pickle
 import six

@@ -101,7 +102,7 @@ class SpaceTokenizer(object):
 class CharTokenizer(object):
-    def __init__(self, vocab, lower=True):
+    def __init__(self, vocab, lower=True, sentencepiece_style_vocab=False):
         """
         char tokenizer (wordpiece english)
         normed txt(space seperated or not) => list of word-piece
         """

@@ -110,6 +111,7 @@ class CharTokenizer(object):
         #self.pat = re.compile(r'([,.!?\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]|[\u4e00-\u9fa5]|[a-zA-Z0-9]+)')
         self.pat = re.compile(r'([a-zA-Z0-9]+|\S)')
         self.lower = lower
+        self.sentencepiece_style_vocab = sentencepiece_style_vocab

     def __call__(self, sen):
         if len(sen) == 0:

@@ -119,11 +121,51 @@ class CharTokenizer(object):
             sen = sen.lower()
         res = []
         for match in self.pat.finditer(sen):
-            words, _ = wordpiece(match.group(0), vocab=self.vocab, unk_token='[UNK]')
+            words, _ = wordpiece(match.group(0), vocab=self.vocab, unk_token='[UNK]', sentencepiece_style_vocab=self.sentencepiece_style_vocab)
             res.extend(words)
         return res

+class WSSPTokenizer(object):
+    def __init__(self, sp_model_dir, word_dict, ws=True, lower=True):
+        self.ws = ws
+        self.lower = lower
+        self.dict = pickle.load(open(word_dict, 'rb'), encoding='utf8')
+        import sentencepiece as spm
+        self.sp_model = spm.SentencePieceProcessor()
+        self.window_size = 5
+        self.sp_model.Load(sp_model_dir)
+
+    def cut(self, chars):
+        words = []
+        idx = 0
+        while idx < len(chars):
+            matched = False
+            for i in range(self.window_size, 0, -1):
+                cand = chars[idx: idx + i]
+                if cand in self.dict:
+                    words.append(cand)
+                    matched = True
+                    break
+            if not matched:
+                i = 1
+                words.append(chars[idx])
+            idx += i
+        return words
+
+    def __call__(self, sen):
+        sen = sen.decode('utf8')
+        if self.ws:
+            sen = [s for s in self.cut(sen) if s != ' ']
+        else:
+            sen = sen.split(' ')
+        if self.lower:
+            sen = [s.lower() for s in sen]
+        sen = ' '.join(sen)
+        ret = self.sp_model.EncodeAsPieces(sen)
+        return ret

 def build_2_pair(seg_a, seg_b, max_seqlen, cls_id, sep_id):
     token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0
     token_type_b = np.ones_like(seg_b, dtype=np.int64) * 1
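For readers skimming the diff, here is a minimal, hypothetical usage sketch of the two tokenizer paths this file now exposes. The import path follows how the examples import utils.data; the SentencePiece model and word-dict file names are placeholders, not files shipped with this commit.

    # Sketch only: assumes a trained SentencePiece model and a pickled word
    # dict are available locally; the file names below are placeholders.
    import utils.data

    vocab = {'[UNK]': 0, '[CLS]': 1, '[SEP]': 2, 'hello': 3, 'world': 4}
    # CharTokenizer keeps its old behaviour unless the new flag is set.
    char_tok = utils.data.CharTokenizer(vocab.keys(), sentencepiece_style_vocab=False)
    pieces = char_tok('hello world')            # word-piece tokens via wordpiece()

    # New in this commit: forward maximum matching (candidates of up to
    # window_size=5 characters against the pickled dict), then SentencePiece.
    sp_tok = utils.data.WSSPTokenizer('ernie_tiny.sp.model',    # placeholder path
                                      'ernie_tiny.word_dict',   # placeholder path
                                      ws=True, lower=True)
    pieces = sp_tok(u'ERNIE tiny 分词示例'.encode('utf8'))      # __call__ expects bytes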
example/finetune_classifier.py

@@ -55,7 +55,7 @@ class ClassificationErnieModel(propeller.train.Model):
         pos_ids = L.cast(pos_ids, 'int64')
         pos_ids.stop_gradient = True
         input_mask.stop_gradient = True
-        task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment
+        task_ids = L.zeros_like(src_ids) + self.hparam.task_id
         task_ids.stop_gradient = True

         ernie = ErnieModel(

@@ -128,6 +128,8 @@ if __name__ == '__main__':
     parser.add_argument('--vocab_file', type=str, required=True)
     parser.add_argument('--do_predict', action='store_true')
     parser.add_argument('--warm_start_from', type=str)
+    parser.add_argument('--sentence_piece_model', type=str, default=None)
+    parser.add_argument('--word_dict', type=str, default=None)
     args = parser.parse_args()
     run_config = propeller.parse_runconfig(args)
     hparams = propeller.parse_hparam(args)

@@ -138,7 +140,12 @@ if __name__ == '__main__':
     cls_id = vocab['[CLS]']
     unk_id = vocab['[UNK]']
-    tokenizer = utils.data.CharTokenizer(vocab.keys())
+    if args.sentence_piece_model is not None:
+        if args.word_dict is None:
+            raise ValueError('--word_dict no specified in subword Model')
+        tokenizer = utils.data.WSSPTokenizer(args.sentence_piece_model, args.word_dict, ws=True, lower=True)
+    else:
+        tokenizer = utils.data.CharTokenizer(vocab.keys())

     def tokenizer_func(inputs):
         '''avoid pickle error'''

@@ -179,7 +186,7 @@ if __name__ == '__main__':
         dev_ds.data_shapes = shapes
         dev_ds.data_types = types

-        varname_to_warmstart = re.compile('encoder.*|pooled.*|.*embedding|pre_encoder_.*')
+        varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
         warm_start_dir = args.warm_start_from
         ws = propeller.WarmStartSetting(
                 predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
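The tightened warm-start regex is the subtle part of this file's change. The sketch below simply compiles the old and new patterns and checks them against a few illustrative parameter names written in the ERNIE naming style; the names are assumptions for demonstration, not read from a real checkpoint.

    import re

    old_pat = re.compile('encoder.*|pooled.*|.*embedding|pre_encoder_.*')
    new_pat = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')

    names = [
        'word_embedding',                                # embedding table
        'encoder_layer_0_multi_head_att_query_fc.w_0',   # transformer weight
        'encoder_layer_0_post_att_layer_norm_scale',     # layer-norm scale
        'pooled_fc.b_0',                                 # pooler bias
        'pooled_fc.w_0_moment1_0',                       # assumed Adam-moment name
    ]
    for name in names:
        print(name, bool(old_pat.match(name)), bool(new_pat.match(name)))
    # The anchored pattern still matches the model parameters above, but it no
    # longer matches optimizer-state names like the last one, which the old
    # 'pooled.*' alternative would have picked up.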
example/finetune_ner.py

@@ -32,7 +32,6 @@ import paddle.fluid.layers as L
 from model.ernie import ErnieModel
 from optimization import optimization
-import tokenization
 import utils.data

 from propeller import log

@@ -121,7 +120,7 @@ class SequenceLabelErnieModel(propeller.train.Model):
 def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_size, max_seqlen, is_train):
     label_map = {v: i for i, v in enumerate(label_list)}
     no_entity_id = label_map['O']
-    delimiter = ''
+    delimiter = b''

     def read_bio_data(filename):
         ds = propeller.data.Dataset.from_file(filename)

@@ -132,10 +131,10 @@ def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_
         while 1:
             line = next(iterator)
             cols = line.rstrip(b'\n').split(b'\t')
+            tokens = cols[0].split(delimiter)
+            labels = cols[1].split(delimiter)
             if len(cols) != 2:
                 continue
-            tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter)
-            labels = tokenization.convert_to_unicode(cols[1]).split(delimiter)
            if len(tokens) != len(labels) or len(tokens) == 0:
                 continue
             yield [tokens, labels]

@@ -151,7 +150,8 @@ def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_
             ret_tokens = []
             ret_labels = []
             for token, label in zip(tokens, labels):
-                sub_token = tokenizer.tokenize(token)
+                sub_token = tokenizer(token)
+                label = label.decode('utf8')
                 if len(sub_token) == 0:
                     continue
                 ret_tokens.extend(sub_token)

@@ -179,7 +179,7 @@ def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_
                 labels = labels[:max_seqlen - 2]
                 tokens = ['[CLS]'] + tokens + ['[SEP]']
-                token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                token_ids = [vocab[t] for t in tokens]
                 label_ids = [no_entity_id] + [label_map[x] for x in labels] + [no_entity_id]
                 token_type_ids = [0] * len(token_ids)
                 input_seqlen = len(token_ids)

@@ -211,7 +211,7 @@ def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_
 def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seqlen):
-    delimiter = ''
+    delimiter = b''

     def stdin_gen():
         if six.PY3:

@@ -232,9 +232,9 @@ def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seql
         while 1:
             line, = next(iterator)
             cols = line.rstrip(b'\n').split(b'\t')
+            tokens = cols[0].split(delimiter)
             if len(cols) != 1:
                 continue
-            tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter)
             if len(tokens) == 0:
                 continue
             yield tokens,

@@ -247,7 +247,7 @@ def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seql
             tokens, = next(iterator)
             ret_tokens = []
             for token in tokens:
-                sub_token = tokenizer.tokenize(token)
+                sub_token = tokenizer(token)
                 if len(sub_token) == 0:
                     continue
                 ret_tokens.extend(sub_token)

@@ -266,7 +266,7 @@ def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seql
                 tokens = tokens[:max_seqlen - 2]
                 tokens = ['[CLS]'] + tokens + ['[SEP]']
-                token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                token_ids = [vocab[t] for t in tokens]
                 token_type_ids = [0] * len(token_ids)
                 input_seqlen = len(token_ids)

@@ -296,13 +296,15 @@ if __name__ == '__main__':
     parser.add_argument('--data_dir', type=str, required=True)
     parser.add_argument('--vocab_file', type=str, required=True)
     parser.add_argument('--do_predict', action='store_true')
+    parser.add_argument('--use_sentence_piece_vocab', action='store_true')
     parser.add_argument('--warm_start_from', type=str)
     args = parser.parse_args()
     run_config = propeller.parse_runconfig(args)
     hparams = propeller.parse_hparam(args)

-    tokenizer = tokenization.FullTokenizer(args.vocab_file)
-    vocab = tokenizer.vocab
+    vocab = {j.strip().split('\t')[0]: i for i, j in enumerate(open(args.vocab_file, 'r', encoding='utf8'))}
+    tokenizer = utils.data.CharTokenizer(vocab, sentencepiece_style_vocab=args.use_sentence_piece_vocab)
     sep_id = vocab['[SEP]']
     cls_id = vocab['[CLS]']
     unk_id = vocab['[UNK]']

@@ -358,7 +360,7 @@ if __name__ == '__main__':
                 from_dir=warm_start_dir
         )

-        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
+        best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
         propeller.train.train_and_eval(
                 model_class_or_model_fn=SequenceLabelErnieModel,
                 params=hparams,

@@ -387,7 +389,6 @@ if __name__ == '__main__':
         predict_ds.data_types = types

         rev_label_map = {i: v for i, v in enumerate(label_list)}
-        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
         learner = propeller.Learner(SequenceLabelErnieModel, run_config, hparams)
         for pred, _ in learner.predict(predict_ds, ckpt=-1):
             pred_str = ' '.join([rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()])
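After this change the NER example no longer goes through tokenization.FullTokenizer: the vocab dict is built straight from the vocab file and token ids are looked up by hand. Below is a self-contained sketch of those two steps, using an inline toy vocab in place of a real --vocab_file; the tab-separated format mirrors the dict comprehension in the diff, and the second column is simply ignored.

    import io

    # Toy stand-in for a vocab file: a token, a tab, then a second field
    # that the dict comprehension ignores.
    vocab_file = io.StringIO('[PAD]\t0\n[CLS]\t1\n[SEP]\t2\n[UNK]\t3\n明\t4\n天\t5\n')

    # Same construction as the diff: token -> line index.
    vocab = {j.strip().split('\t')[0]: i for i, j in enumerate(vocab_file)}

    tokens = ['[CLS]', '明', '天', '[SEP]']
    token_ids = [vocab[t] for t in tokens]      # replaces convert_tokens_to_ids
    token_type_ids = [0] * len(token_ids)
    print(token_ids, token_type_ids)            # [1, 4, 5, 2] [0, 0, 0, 0]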
example/finetune_ranker.py

@@ -146,6 +146,7 @@ if __name__ == '__main__':
     parser.add_argument('--data_dir', type=str, required=True)
     parser.add_argument('--warm_start_from', type=str)
     parser.add_argument('--sentence_piece_model', type=str, default=None)
+    parser.add_argument('--word_dict', type=str, default=None)
     args = parser.parse_args()
     run_config = propeller.parse_runconfig(args)
     hparams = propeller.parse_hparam(args)

@@ -157,7 +158,9 @@ if __name__ == '__main__':
     unk_id = vocab['[UNK]']
     if args.sentence_piece_model is not None:
-        tokenizer = utils.data.JBSPTokenizer(args.sentence_piece_model, jb=True, lower=True)
+        if args.word_dict is None:
+            raise ValueError('--word_dict no specified in subword Model')
+        tokenizer = utils.data.WSSPTokenizer(args.sentence_piece_model, args.word_dict, ws=True, lower=True)
     else:
         tokenizer = utils.data.CharTokenizer(vocab.keys())

@@ -218,7 +221,7 @@ if __name__ == '__main__':
                 from_dir=warm_start_dir
         )

-        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
+        best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
         propeller.train_and_eval(
                 model_class_or_model_fn=RankingErnieModel,
                 params=hparams,

@@ -258,6 +261,7 @@ if __name__ == '__main__':
         est = propeller.Learner(RankingErnieModel, run_config, hparams)
         for qid, res in est.predict(predict_ds, ckpt=-1):
             print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1]))
         #for i in predict_ds:
         #    sen = i[0]
         #    for ss in np.squeeze(sen):
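The ranker's predict loop prints one tab-separated line per example; the tiny sketch below only illustrates what that %-format produces for a two-class score vector, with made-up numbers standing in for real model output.

    import numpy as np

    qid = np.array([42])                 # hypothetical query id for one example
    res = np.array([0.27310, 0.72690])   # hypothetical class scores

    print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1]))
    # 42    1    0.27310    0.72690      (qid, predicted label, score_0, score_1)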