PaddlePaddle / ERNIE
Commit 815159f4 (unverified)
Written on Aug 19, 2019 by tianxin; committed via GitHub on Aug 19, 2019.

Merge pull request #287 from zhanghan1992/develop

support IO,IOB,IOE,IOBES for sequence labeling & fix en mrc

Parents: 5c8c3e3e, 80e9ab69
Showing 8 changed files with 61 additions and 50 deletions (+61 -50).
finetune/sequence_label.py                     +24  -14
finetune_args.py                                +1   -0
reader/task_reader.py                          +28  -20
run_mrc.py                                      +0   -8
run_sequence_labeling.py                        +0   -7
script/zh_task/ernie_base/run_msra_ner.sh       +1   -0
script/zh_task/ernie_large/run_msra_ner.sh      +1   -0
tokenization.py                                 +6   -1
finetune/sequence_label.py

```diff
@@ -68,11 +68,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         bias_attr=fluid.ParamAttr(
             name="cls_seq_label_out_b",
             initializer=fluid.initializer.Constant(0.)))
 
+    infers = fluid.layers.argmax(logits, axis=2)
     ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
-    ret_infers = fluid.layers.reshape(
-        x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
+    lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
+    lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
+
+    (_, _, _, num_infer, num_label, num_correct) = fluid.layers.chunk_eval(
+        input=lod_infers,
+        label=lod_labels,
+        chunk_scheme=args.chunk_scheme,
+        num_chunk_types=((args.num_labels - 1) // (len(args.chunk_scheme) - 1)))
 
     labels = fluid.layers.flatten(labels, axis=2)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
@@ -92,6 +100,9 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
         "probs": probs,
         "labels": ret_labels,
         "infers": ret_infers,
+        "num_infer": num_infer,
+        "num_label": num_label,
+        "num_correct": num_correct,
         "seq_lens": seq_lens
     }
@@ -212,8 +223,8 @@ def evaluate(exe,
              eval_phase,
              dev_count=1):
     fetch_list = [
-        graph_vars["labels"].name, graph_vars["infers"].name,
-        graph_vars["seq_lens"].name
+        graph_vars["num_infer"].name, graph_vars["num_label"].name,
+        graph_vars["num_correct"].name
     ]
     if eval_phase == "train":
@@ -221,9 +232,10 @@ def evaluate(exe,
         if "learning_rate" in graph_vars:
             fetch_list.append(graph_vars["learning_rate"].name)
         outputs = exe.run(fetch_list=fetch_list)
-        np_labels, np_infers, np_lens, np_loss = outputs[:4]
-        num_label, num_infer, num_correct = chunk_eval(
-            np_labels, np_infers, np_lens, tag_num, dev_count)
+        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
+        num_label = np.sum(np_num_label)
+        num_infer = np.sum(np_num_infer)
+        num_correct = np.sum(np_num_correct)
         precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
         rets = {
             "precision": precision,
@@ -241,13 +253,11 @@ def evaluate(exe,
         pyreader.start()
         while True:
             try:
-                np_labels, np_infers, np_lens = exe.run(program=program,
-                                                        fetch_list=fetch_list)
-                label_num, infer_num, correct_num = chunk_eval(
-                    np_labels, np_infers, np_lens, tag_num, dev_count)
-                total_infer += infer_num
-                total_label += label_num
-                total_correct += correct_num
+                np_num_infer, np_num_label, np_num_correct = exe.run(program=program,
+                                                                     fetch_list=fetch_list)
+                total_infer += np.sum(np_num_infer)
+                total_label += np.sum(np_num_label)
+                total_correct += np.sum(np_num_correct)
             except fluid.core.EOFException:
                 pyreader.reset()
```
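The substantive change: chunk counting now happens inside the Paddle graph. Predictions and labels are unpadded into LoD tensors with `sequence_unpad` and fed to `fluid.layers.chunk_eval`, which understands the configured scheme, so `evaluate` only has to sum three integer counters per batch instead of re-parsing padded numpy arrays in Python. Below is a minimal sketch of the downstream arithmetic, assuming `calculate_f1` (defined elsewhere in this file) implements the standard micro-averaged formula; the helper name and the numbers are illustrative, not from the diff.

```python
# Sketch only: how the three fetched counters are presumably combined.
def micro_f1(num_label, num_infer, num_correct):
    """num_label: gold chunks, num_infer: predicted chunks, num_correct: matches."""
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# e.g. 90 matching chunks out of 100 predicted and 120 gold:
print(micro_f1(120, 100, 90))  # (0.9, 0.75, 0.8181...)
```

Because the counters are plain sums, aggregation across batches (and across `dev_count` workers) stays exact; there is no per-batch averaging bias.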
finetune_args.py

```diff
@@ -82,6 +82,7 @@ data_g.add_arg("doc_stride", int, 128,
                "When splitting up a long document into chunks, how much stride to take between chunks.")
 data_g.add_arg("n_best_size", int, 20,
                "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="chunk scheme")
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
```
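The new `chunk_scheme` flag does double duty: `SequenceLabelReader` uses it when spreading word-level tags over subtokens, and `create_model` derives `num_chunk_types` from it via `(num_labels - 1) // (len(chunk_scheme) - 1)`. A quick sanity check of that expression with illustrative label counts (the MSRA NER scripts below use IOB with `--num_labels 7`, i.e. three entity types):

```python
# Each scheme spends len(scheme) - 1 tags per entity type, plus one shared "O",
# so three entity types need 4 labels under IO, 7 under IOB/IOE, 13 under IOBES.
for scheme, num_labels in [("IO", 4), ("IOB", 7), ("IOE", 7), ("IOBES", 13)]:
    print(scheme, (num_labels - 1) // (len(scheme) - 1))  # -> 3 in every case
```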
reader/task_reader.py

```diff
@@ -347,13 +347,23 @@ class SequenceLabelReader(BaseReader):
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) == 1:
+            if len(sub_token) < 2:
+                ret_labels.append(label)
                 continue
-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_laebl = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_laebl] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
 
         assert len(ret_tokens) == len(ret_labels)
         return ret_tokens, ret_labels
@@ -451,6 +461,15 @@ class MRCReader(BaseReader):
         self.current_epoch = 0
         self.num_examples = 0
 
+        self.Example = namedtuple('Example',
+                                  ['qas_id', 'question_text', 'doc_tokens',
+                                   'orig_answer_text', 'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", [
+            "unique_id", "example_index", "doc_span_index", "tokens",
+            "token_to_orig_map", "token_is_max_context", "token_ids",
+            "position_ids", "text_type_ids", "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
+
     def _read_json(self, input_file, is_training):
         examples = []
         with open(input_file, "r") as f:
@@ -495,12 +514,7 @@ class MRCReader(BaseReader):
             doc_tokens = tokenization.tokenize_chinese_chars(
                 paragraph_text)
 
-            Example = namedtuple('Example', [
-                'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
-                'start_position', 'end_position'
-            ])
-            example = Example(
+            example = self.Example(
                 qas_id=qas_id,
                 question_text=question_text,
                 doc_tokens=doc_tokens,
@@ -544,11 +558,6 @@ class MRCReader(BaseReader):
     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                     is_training):
-        Feature = namedtuple("Feature", [
-            "unique_id", "example_index", "doc_span_index", "tokens",
-            "token_to_orig_map", "token_is_max_context", "token_ids",
-            "position_ids", "text_type_ids", "start_position", "end_position"
-        ])
         features = []
         unique_id = 1000000000
@@ -581,14 +590,13 @@ class MRCReader(BaseReader):
                 tokenizer, example.orig_answer_text)
 
             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            _DocSpan = namedtuple("DocSpan", ["start", "length"])
             doc_spans = []
             start_offset = 0
             while start_offset < len(all_doc_tokens):
                 length = len(all_doc_tokens) - start_offset
                 if length > max_tokens_for_doc:
                     length = max_tokens_for_doc
-                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
                 if start_offset + length == len(all_doc_tokens):
                     break
                 start_offset += min(length, self.doc_stride)
@@ -638,7 +646,7 @@ class MRCReader(BaseReader):
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset
 
-            feature = Feature(
+            feature = self.Feature(
                 unique_id=unique_id,
                 example_index=example_index,
                 doc_span_index=doc_span_index,
```
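Two independent fixes land in the reader. For sequence labeling, when the wordpiece tokenizer splits a word into several subtokens, the word's tag is now spread across them according to the active scheme; the old code handled only the IOB case (first subtoken keeps the tag, the rest become `I-`), which corrupted `E-` and `S-` tags under IOE/IOBES. A standalone sketch of the new rule, with a hypothetical helper name for readability:

```python
def spread_label(label, n_sub):
    """Spread one word-level tag over n_sub wordpiece subtokens (sketch of the
    inline logic in SequenceLabelReader; the helper name is made up)."""
    if n_sub < 2 or label == "O" or label.startswith("I-"):
        return [label] * n_sub
    if label.startswith("B-"):   # begin tag: continue with I-
        return [label] + ["I-" + label[2:]] * (n_sub - 1)
    if label.startswith("S-"):   # IOBES singleton: becomes B- ... I- ... E-
        t = label[2:]
        return ["B-" + t] + ["I-" + t] * (n_sub - 2) + ["E-" + t]
    if label.startswith("E-"):   # end tag: stays on the last subtoken
        return ["I-" + label[2:]] * (n_sub - 1) + [label]
    return [label] * n_sub

print(spread_label("B-PER", 3))  # ['B-PER', 'I-PER', 'I-PER']
print(spread_label("S-LOC", 3))  # ['B-LOC', 'I-LOC', 'E-LOC']
print(spread_label("E-ORG", 2))  # ['I-ORG', 'E-ORG']
```

For MRC, the `Example`, `Feature`, and `DocSpan` namedtuples move into `MRCReader.__init__`, so `_read_json` and `_convert_example_to_feature` now build and consume the same types instead of each defining a local copy.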
run_mrc.py

```diff
@@ -118,14 +118,6 @@ def main(args):
             weight_decay=args.weight_decay,
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)
-        """
-        fluid.memory_optimize(
-            input_program=train_program,
-            skip_opt_set=[
-                graph_vars["loss"].name,
-                graph_vars["num_seqs"].name,
-            ])
-        """
 
         if args.verbose:
             if args.in_tokens:
```
run_sequence_labeling.py

```diff
@@ -109,13 +109,6 @@ def main(args):
             scheduler=args.lr_scheduler,
             use_fp16=args.use_fp16)
 
-        fluid.memory_optimize(
-            input_program=train_program,
-            skip_opt_set=[
-                graph_vars["loss"].name, graph_vars["labels"].name,
-                graph_vars["infers"].name, graph_vars["seq_lens"].name
-            ])
-
         if args.verbose:
             if args.in_tokens:
                 lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
```
script/zh_task/ernie_base/run_msra_ner.sh

```diff
@@ -12,6 +12,7 @@ python -u run_sequence_labeling.py \
                    --batch_size 16 \
                    --init_pretraining_params ${MODEL_PATH}/params \
                    --num_labels 7 \
+                   --chunk_scheme "IOB" \
                    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
                    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
                    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
```
script/zh_task/ernie_large/run_msra_ner.sh

```diff
@@ -11,6 +11,7 @@ python -u run_sequence_labeling.py \
                    --batch_size 16 \
                    --init_pretraining_params ${MODEL_PATH}/params \
                    --num_labels 7 \
+                   --chunk_scheme "IOB" \
                    --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
                    --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
                    --dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
```
tokenization.py

```diff
@@ -395,11 +395,16 @@ def tokenize_chinese_chars(text):
         return False
 
+    def _is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
     output = []
     buff = ""
     for char in text:
         cp = ord(char)
-        if _is_chinese_char(cp):
+        if _is_chinese_char(cp) or _is_whitespace(char):
             if buff != "":
                 output.append(buff)
                 buff = ""
```
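`tokenize_chinese_chars` previously flushed its buffer only at CJK characters, so a run of English text came back as one glued chunk, whitespace included; this is the "fix en mrc" half of the PR, since answer-span alignment for English SQuAD-style data needs word-level pieces. A self-contained toy version of the patched behavior (the real `_is_chinese_char` covers all CJK ranges, and the unchanged tail of the loop is assumed from the pre-existing code):

```python
def _is_chinese_char(cp):
    return 0x4E00 <= cp <= 0x9FFF  # toy version: basic CJK block only

def _is_whitespace(c):
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

def tokenize_chinese_chars(text):
    output, buff = [], ""
    for char in text:
        if _is_chinese_char(ord(char)) or _is_whitespace(char):
            if buff != "":           # flush any pending non-Chinese run
                output.append(buff)
                buff = ""
            output.append(char)      # emit the boundary character itself
        else:
            buff += char
    if buff != "":
        output.append(buff)
    return output

print(tokenize_chinese_chars("bank of 中国"))
# ['bank', ' ', 'of', ' ', '中', '国']
# Before the fix, the whole prefix "bank of " stayed buffered as one chunk
# until the first Chinese character was seen.
```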