PaddlePaddle / PALM, commit dc1c43e8
Authored Oct 23, 2019 by xixiaoyao

fix bugs

Parent: e2368644

Showing 45 changed files with 0 additions and 6974 deletions
build/lib/paddlepalm/__init__.py                     +0 -5
build/lib/paddlepalm/backbone/__init__.py            +0 -0
build/lib/paddlepalm/backbone/bert.py                +0 -156
build/lib/paddlepalm/backbone/bow.py                 +0 -63
build/lib/paddlepalm/backbone/ernie.py               +0 -170
build/lib/paddlepalm/backbone/utils/__init__.py      +0 -0
build/lib/paddlepalm/backbone/utils/transformer.py   +0 -341
build/lib/paddlepalm/default_settings.py             +0 -42
build/lib/paddlepalm/interface.py                    +0 -173
build/lib/paddlepalm/mtl_controller.py               +0 -717
build/lib/paddlepalm/optimizer/__init__.py           +0 -0
build/lib/paddlepalm/optimizer/adam.py               +0 -108
build/lib/paddlepalm/reader/__init__.py              +0 -0
build/lib/paddlepalm/reader/cls4bert.py              +0 -0
build/lib/paddlepalm/reader/match4ernie.py           +0 -103
build/lib/paddlepalm/reader/mlm.py                   +0 -103
build/lib/paddlepalm/reader/mrc4bert.py              +0 -656
build/lib/paddlepalm/reader/mrc4ernie.py             +0 -119
build/lib/paddlepalm/reader/utils/__init__.py        +0 -0
build/lib/paddlepalm/reader/utils/batching4bert.py   +0 -184
build/lib/paddlepalm/reader/utils/batching4ernie.py  +0 -175
build/lib/paddlepalm/reader/utils/mlm_batching.py    +0 -175
build/lib/paddlepalm/reader/utils/mrqa_helper.py     +0 -84
build/lib/paddlepalm/reader/utils/reader4ernie.py    +0 -989
build/lib/paddlepalm/task_instance.py                +0 -286
build/lib/paddlepalm/task_paradigm/__init__.py       +0 -0
build/lib/paddlepalm/task_paradigm/cls.py            +0 -60
build/lib/paddlepalm/task_paradigm/match.py          +0 -70
build/lib/paddlepalm/task_paradigm/mlm.py            +0 -111
build/lib/paddlepalm/task_paradigm/mrc.py            +0 -486
build/lib/paddlepalm/tokenizer/__init__.py           +0 -0
build/lib/paddlepalm/tokenizer/bert_tokenizer.py     +0 -374
build/lib/paddlepalm/tokenizer/ernie_tokenizer.py    +0 -417
build/lib/paddlepalm/utils/__init__.py               +0 -0
build/lib/paddlepalm/utils/config_helper.py          +0 -311
build/lib/paddlepalm/utils/print_helper.py           +0 -31
build/lib/paddlepalm/utils/reader_helper.py          +0 -226
build/lib/paddlepalm/utils/saver.py                  +0 -65
build/lib/paddlepalm/utils/textprocess_helper.py     +0 -19
dist/paddle_palm-1.2-py2.7.egg                       +0 -0
paddle_palm.egg-info/PKG-INFO                        +0 -105
paddle_palm.egg-info/SOURCES.txt                     +0 -47
paddle_palm.egg-info/dependency_links.txt            +0 -1
paddle_palm.egg-info/not-zip-safe                    +0 -1
paddle_palm.egg-info/top_level.txt                   +0 -1
build/lib/paddlepalm/__init__.py  deleted  100644 → 0

import sys
from paddlepalm.mtl_controller import Controller

sys.path.append('paddlepalm')
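The package exposes Controller as its entry point. A minimal usage sketch, mirroring the __main__ block of mtl_controller.py further down in this diff; the file and directory names here are placeholders, not taken from the repo:

# Hypothetical driver script; mirrors the __main__ block of mtl_controller.py.
import paddlepalm

# 'mtl_config.yaml' is a placeholder; any yaml satisfying REQUIRED_ARGS in
# default_settings.py (task_instance, backbone, optimizer, learning_rate,
# batch_size) should work.
controller = paddlepalm.Controller('mtl_config.yaml', task_dir='.', for_train=True)
controller.load_pretrain('pretrain_model')  # path is a placeholder
controller.train()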
build/lib/paddlepalm/backbone/__init__.py  deleted  100644 → 0  (empty file)
build/lib/paddlepalm/backbone/bert.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""v1.1
BERT model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import fluid
from paddle.fluid import layers

from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone


class Model(backbone):

    def __init__(self, config, phase):
        # self._is_training = phase == 'train'
        # (a backbone generally need not care about the running phase,
        #  since its outputs barely change across phases)

        self._emb_size = config["hidden_size"]
        self._n_layer = config["num_hidden_layers"]
        self._n_head = config["num_attention_heads"]
        self._voc_size = config["vocab_size"]
        self._max_position_seq_len = config["max_position_embeddings"]
        self._sent_types = config["type_vocab_size"]
        self._hidden_act = config["hidden_act"]
        self._prepostprocess_dropout = config["hidden_dropout_prob"]
        self._attention_dropout = config["attention_probs_dropout_prob"]

        # NOTE: the original line read `self.model_name = model_name`, which
        # referenced an undefined variable; reading the prefix from config
        # (defaulting to '') is an assumed fix that keeps the class usable.
        self.model_name = config.get("model_name", "")

        self._word_emb_name = self.model_name + "word_embedding"
        self._pos_emb_name = self.model_name + "pos_embedding"
        self._sent_emb_name = self.model_name + "sent_embedding"

        # Initialize all weights by truncated normal initializer, and all
        # biases will be initialized by constant zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config["initializer_range"])

    @property
    def inputs_attr(self):
        return {"token_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
                "position_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
                "segment_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
                "input_mask": [[-1, self._max_position_seq_len, 1], 'float32']}

    @property
    def outputs_attr(self):
        return {"word_emb": [-1, self._max_position_seq_len, self._emb_size],
                "sentence_emb": [-1, self._emb_size],
                "sentence_pair_emb": [-1, self._emb_size]}

    def build(self, inputs):
        src_ids = inputs['token_ids']
        pos_ids = inputs['position_ids']
        sent_ids = inputs['segment_ids']
        input_mask = inputs['input_mask']

        # padding id in vocabulary must be set to 0
        emb_out = layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype="float32",
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)
        self.emb_out = emb_out

        position_emb_out = layers.embedding(
            input=pos_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype="float32",
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))
        self.position_emb_out = position_emb_out

        sent_emb_out = layers.embedding(
            sent_ids,
            size=[self._sent_types, self._emb_size],
            dtype="float32",
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))
        self.sent_emb_out = sent_emb_out

        emb_out = emb_out + position_emb_out + sent_emb_out

        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

        self_attn_mask = layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)

        self_attn_mask = layers.scale(
            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

        enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name=self.model_name + 'encoder')

        next_sent_feat = layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name=self.model_name + "pooled_fc.w_0",
                initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0")

        return {'word_emb': enc_out,
                'sentence_emb': next_sent_feat,
                'sentence_pair_emb': next_sent_feat}

    def postprocess(self, rt_outputs):
        pass
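For reference, Model.__init__ above reads the standard BERT config keys. A sketch of a config dict that would satisfy it; only the keys are dictated by the code, the values follow common BERT-base conventions and are assumptions, not taken from this repo:

# Illustrative BERT-base style config for backbone.bert.Model; values are
# assumptions, only the keys come from __init__ above.
bert_config = {
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "vocab_size": 30522,
    "max_position_embeddings": 512,
    "type_vocab_size": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "initializer_range": 0.02,
}
# model = Model(bert_config, phase='train')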
build/lib/paddlepalm/backbone/bow.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import fluid
from paddle.fluid import layers

# NOTE: this import was missing in the original file, leaving the
# `backbone` base class undefined.
from paddlepalm.interface import backbone


class Model(backbone):

    def __init__(self, config, phase):
        # a backbone generally need not care about the running phase,
        # since its outputs barely change across phases
        self._emb_size = config["emb_size"]
        self._voc_size = config["vocab_size"]
        # NOTE: `_max_position_seq_len` is referenced below but was never set
        # in the original __init__; reading it from config (with -1, i.e.
        # variable length, as the fallback) is an assumed fix.
        self._max_position_seq_len = config.get("max_position_embeddings", -1)

    @property
    def inputs_attr(self):
        return {"token_ids": [[-1, self._max_position_seq_len, 1], 'int64']}

    @property
    def outputs_attr(self):
        return {"word_emb": [-1, self._max_position_seq_len, self._emb_size],
                "sentence_emb": [-1, self._emb_size * 2]}

    def build(self, inputs):
        tok_ids = inputs['token_ids']
        emb_out = layers.embedding(
            input=tok_ids,
            size=[self._voc_size, self._emb_size],
            dtype='float32',
            param_attr=fluid.ParamAttr(
                name='word_emb',
                initializer=fluid.initializer.TruncatedNormal(scale=0.1)),
            is_sparse=False)

        # bag-of-words pooling: mean and max over the time axis, concatenated
        sent_emb1 = layers.reduce_mean(emb_out, axis=1)
        sent_emb2 = layers.reduce_max(emb_out, axis=1)
        sent_emb = layers.concat([sent_emb1, sent_emb2], axis=1)

        return {'word_emb': emb_out,
                'sentence_emb': sent_emb}

    def postprocess(self, rt_outputs):
        pass
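The mean/max pooling above is why sentence_emb is declared with width 2 * emb_size. A quick numpy sketch of the same concatenation, with numpy standing in for the fluid ops purely for illustration:

import numpy as np

# [batch, seq_len, emb_size] token embeddings, illustrative values
emb_out = np.random.rand(4, 16, 128)

sent_emb1 = emb_out.mean(axis=1)   # mirrors layers.reduce_mean(..., axis=1)
sent_emb2 = emb_out.max(axis=1)    # mirrors layers.reduce_max(..., axis=1)
sent_emb = np.concatenate([sent_emb1, sent_emb2], axis=1)

assert sent_emb.shape == (4, 256)  # 2 * emb_size, matching outputs_attr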
build/lib/paddlepalm/backbone/ernie.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from paddle import fluid
from paddle.fluid import layers

from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone


class Model(backbone):

    def __init__(self, config, phase):
        # a backbone generally need not care about the running phase,
        # since its outputs barely change across phases
        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        if config['sent_type_vocab_size']:
            self._sent_types = config['sent_type_vocab_size']
        else:
            self._sent_types = config['type_vocab_size']
        self._task_types = config['task_type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"

        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])

    @property
    def inputs_attr(self):
        return {"token_ids": [[-1, -1, 1], 'int64'],
                "position_ids": [[-1, -1, 1], 'int64'],
                "segment_ids": [[-1, -1, 1], 'int64'],
                "input_mask": [[-1, -1, 1], 'float32'],
                "task_ids": [[-1, -1, 1], 'int64']}

    @property
    def outputs_attr(self):
        return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
                "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
                "sentence_embedding": [[-1, self._emb_size], 'float32'],
                "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}

    def build(self, inputs):
        src_ids = inputs['token_ids']
        pos_ids = inputs['position_ids']
        sent_ids = inputs['segment_ids']
        input_mask = inputs['input_mask']
        task_ids = inputs['task_ids']

        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)

        position_emb_out = fluid.layers.embedding(
            input=pos_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(
            sent_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))

        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        task_emb_out = fluid.layers.embedding(
            task_ids,
            size=[self._task_types, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=self._task_emb_name, initializer=self._param_initializer))

        emb_out = emb_out + task_emb_out

        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

        self_attn_mask = fluid.layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)

        self_attn_mask = fluid.layers.scale(
            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)

        n_head_self_attn_mask = fluid.layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

        enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')

        next_sent_feat = fluid.layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.reshape(
            next_sent_feat, [-1, next_sent_feat.shape[-1]])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
            act="tanh",
            param_attr=fluid.ParamAttr(
                name="pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0")

        return {'word_embedding': emb_out,
                'encoder_outputs': enc_out,
                'sentence_embedding': next_sent_feat,
                'sentence_pair_embedding': next_sent_feat}

    def postprocess(self, rt_outputs):
        pass
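Both backbones build their attention bias the same way: the outer product of the mask is passed through scale(x, scale=10000.0, bias=-1.0, bias_after_scale=False), which computes 10000 * (x - 1), giving 0 where both positions are real tokens and -10000 where padding is involved, so those logits vanish after softmax. A numpy sketch of that arithmetic, for illustration only:

import numpy as np

# One sequence of length 4 with one padding position (mask as a column vector).
input_mask = np.array([[1.0], [1.0], [1.0], [0.0]])  # [seq_len, 1]

self_attn_mask = input_mask @ input_mask.T           # matmul(..., transpose_y=True)
attn_bias = 10000.0 * (self_attn_mask - 1.0)         # bias_after_scale=False: scale * (x + bias)

# 0 where query and key are both real tokens, -10000 where padding is
# involved, which drives those attention weights toward zero.
print(attn_bias)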
build/lib/paddlepalm/backbone/utils/__init__.py  deleted  100644 → 0  (empty file)
build/lib/paddlepalm/backbone/utils/transformer.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from functools import partial

import paddle.fluid as fluid
import paddle.fluid.layers as layers


def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    computing the softmax activation, to mask certain selected positions so
    that they will not be considered in attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: queries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out


def positionwise_feed_forward(x,
                              d_inner_hid,
                              d_hid,
                              dropout_rate,
                              hidden_act,
                              param_initializer=None,
                              name='ffn'):
    """
    Position-wise Feed-Forward Networks.
    This module consists of two linear transformations with a ReLU activation
    in between, which is applied to each position separately and identically.
    """
    hidden = layers.fc(input=x,
                       size=d_inner_hid,
                       num_flatten_dims=2,
                       act=hidden_act,
                       param_attr=fluid.ParamAttr(
                           name=name + '_fc_0.w_0',
                           initializer=param_initializer),
                       bias_attr=name + '_fc_0.b_0')
    if dropout_rate:
        hidden = layers.dropout(
            hidden,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    out = layers.fc(input=hidden,
                    size=d_hid,
                    num_flatten_dims=2,
                    param_attr=fluid.ParamAttr(
                        name=name + '_fc_1.w_0',
                        initializer=param_initializer),
                    bias_attr=name + '_fc_1.b_0')
    return out


def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out


pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer


def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  hidden_act,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  param_initializer=None,
                  name=''):
    """The encoder layers that can be stacked to form a deep encoder.
    This module consists of a multi-head (self) attention followed by
    position-wise feed-forward networks, both components accompanied by
    post_process_layer to add residual connection, layer normalization
    and dropout.
    """
    attn_output = multi_head_attention(
        pre_process_layer(
            enc_input,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_att'),
        None,
        None,
        attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        attention_dropout,
        param_initializer=param_initializer,
        name=name + '_multi_head_att')
    attn_output = post_process_layer(
        enc_input,
        attn_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_att')
    ffd_output = positionwise_feed_forward(
        pre_process_layer(
            attn_output,
            preprocess_cmd,
            prepostprocess_dropout,
            name=name + '_pre_ffn'),
        d_inner_hid,
        d_model,
        relu_dropout,
        hidden_act,
        param_initializer=param_initializer,
        name=name + '_ffn')
    return post_process_layer(
        attn_output,
        ffd_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_ffn')


def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd="n",
            postprocess_cmd="da",
            param_initializer=None,
            name=''):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.
    """
    for i in range(n_layer):
        enc_output = encoder_layer(
            enc_input,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd,
            postprocess_cmd,
            param_initializer=param_initializer,
            name=name + '_layer_' + str(i))
        enc_input = enc_output
    enc_output = pre_process_layer(
        enc_output, preprocess_cmd, prepostprocess_dropout,
        name="post_encoder")
    return enc_output
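The preprocess_cmd/postprocess_cmd strings are interpreted character by character by pre_post_process_layer: 'a' adds the residual, 'n' applies layer norm, 'd' applies dropout. The BERT/ERNIE backbones above pass preprocess_cmd="" and postprocess_cmd="dan", i.e. dropout, then residual add, then layer norm after each sublayer (post-LN); the defaults "n"/"da" would give the pre-LN arrangement. A pure-Python paraphrase of that dispatch, with floats standing in for tensors and identity stand-ins for the norm and dropout ops, for illustration only:

# Paraphrase of pre_post_process_layer's command dispatch; not the real ops.
def process(prev_out, out, process_cmd):
    for cmd in process_cmd:
        if cmd == "a":    # residual connection
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":  # layer norm; layers.layer_norm(...) in the real code
            out = out
        elif cmd == "d":  # dropout; layers.dropout(...) in the real code
            out = out
    return out

# postprocess_cmd="dan": dropout -> add residual -> layer norm (post-LN)
print(process(prev_out=1.0, out=2.0, process_cmd="dan"))  # 3.0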
build/lib/paddlepalm/default_settings.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

BACKBONE_DIR = 'paddlepalm.backbone'
TASK_INSTANCE_DIR = 'paddlepalm.task_instance'
READER_DIR = 'paddlepalm.reader'
PARADIGM_DIR = 'paddlepalm.task_paradigm'
OPTIMIZER_DIR = 'paddlepalm.optimizer'
OPTIMIZE_METHOD = 'optimize'

REQUIRED_ARGS = {
    'task_instance': str,
    'backbone': str,
    'optimizer': str,
    'learning_rate': float,
    'batch_size': int
}

OPTIONAL_ARGS = {
    'mix_ratio': str,
    'target_tag': str,
    'reuse_rag': str  # note: possibly a typo for 'reuse_tag' in the original
}

TASK_REQUIRED_ARGS = {
    'paradigm': str,
    'reader': str,
    'train_file': str
}
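A global config therefore has to provide at least the five REQUIRED_ARGS. A minimal sketch of such a dict; the module names follow the directories declared in this file, everything else is a placeholder:

# Minimal global config satisfying REQUIRED_ARGS; values are placeholders.
mtl_conf = {
    'task_instance': 'mrqa',   # resolved to <task_dir>/mrqa.yaml by the controller
    'backbone': 'ernie',       # imported from paddlepalm.backbone (BACKBONE_DIR)
    'optimizer': 'adam',       # imported from paddlepalm.optimizer (OPTIMIZER_DIR)
    'learning_rate': 3e-5,
    'batch_size': 32,
    # OPTIONAL_ARGS are comma-separated strings parsed by _parse_list:
    'mix_ratio': '1.0',
    'target_tag': '1',
}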
build/lib/paddlepalm/interface.py  deleted  100644 → 0

# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""v1.1"""


class reader(object):
    """interface of data manager."""

    def __init__(self, config):
        assert isinstance(config, dict)

    # @property
    # def inputs_attr(self):
    #     """Describes the attributes of the reader's input objects: each
    #     object's name, shape and dtype. For scalar types (str, int, float,
    #     ...) the shape is the empty list []; a dimension of variable length
    #     is written as -1.
    #     Return:
    #         dict. Attribute descriptions of the input objects. For example,
    #         a text classification task may need the input text and label id:
    #             {"text": ([], 'str'),
    #              "label": ([], 'int')}
    #         a tagging task may need the token sequence and the tag sequence:
    #             {"tokens", ([-1], 'str'),
    #              "tags", ([-1], 'str')}
    #         a machine reading comprehension task may need the context,
    #         question, answer, answer span positions, and so on:
    #             {"paragraph", ([], 'str'),
    #              "question", ([], 'str'),
    #              "start_position", ([], 'int')
    #     """
    #     raise NotImplementedError()

    @property
    def outputs_attr(self):
        """Describes the attributes of the reader's output objects (the
        objects being yielded): each object's name, shape and dtype. For
        scalar types (str, int, float, ...) the shape is the empty list [];
        a dimension of variable length is written as -1.
        Note: with mini-batch gradient descent, regular input objects should
        carry a batch_size dimension (usually -1).
        Return:
            dict. Attribute descriptions of the output objects. For example,
            for text classification and matching tasks the yielded outputs
            may contain the following objects (downstream backbones and tasks
            access them on demand):
                {"token_ids": ([-1, max_len], 'int64'),
                 "input_ids": ([-1, max_len], 'int64'),
                 "segment_ids": ([-1, max_len], 'int64'),
                 "input_mask": ([-1, max_len], 'float32'),
                 "label": ([-1], 'int')}
        """
        raise NotImplementedError()

    # def parse_line(self):
    #     """Internally the framework describes each sample with a dict whose
    #     keys come from inputs_attr and whose values conform to the described
    #     attrs. This method parses a text line into such a dict. The default
    #     parse_line reads JSON-formatted dataset files in which every line is
    #     one JSON-described sample. Users may override it to adapt to other
    #     dataset formats, e.g. csv or even tfrecord files.
    #     """
    #     raise NotImplementedError()
    #
    # def tokenize(self, line):
    #     """Built-in tokenizers such as the word piece tokenizer are
    #     provided; users may select one via the tokenizer hyperparameter, or
    #     override this method to plug in a custom tokenizer when none of the
    #     built-in ones fits.
    #     Args:
    #         - line: a unicode string.
    #     Return:
    #         a list of tokens
    #     """
    #     raise NotImplementedError()

    def iterator(self):
        """Dataset traversal interface. Note that when the iterator reaches
        the end of the dataset it should reset itself automatically, i.e.
        restart a new pass from the head of the dataset.
        Yield:
            (dict) elements that meet the requirements in the output template
        """
        raise NotImplementedError()

    @property
    def num_examples(self):
        """Number of samples in the dataset, i.e. the number of samples the
        iterator generates per epoch. Note that with strategies that may
        change the sample count (e.g. sliding windows), this should return
        the actual number at runtime."""
        raise NotImplementedError()


class backbone(object):
    """interface of backbone model."""

    def __init__(self, config, phase):
        """
        Args:
            config: dict. The hyperparameters defined in the multi-task
                config file plus the pretrained model config file.
            phase: str. The running phase; currently train and predict are
                supported.
        """
        assert isinstance(config, dict)

    @property
    def inputs_attr(self):
        """Describes the attributes of the input objects the backbone needs
        from the reader: each object's name, shape and dtype. Scalar types
        take shape []; a dimension of variable length is written as -1.
        Return:
            dict. For example, for text classification and matching tasks,
            the reader objects the bert backbone relies on mainly include
                {"token_ids": ([-1, max_len], 'int64'),
                 "input_ids": ([-1, max_len], 'int64'),
                 "segment_ids": ([-1, max_len], 'int64'),
                 "input_mask": ([-1, max_len], 'float32')}"""
        raise NotImplementedError()

    @property
    def outputs_attr(self):
        """Describes the attributes of the backbone's output objects: each
        object's name, shape and dtype. Scalar types take shape []; a
        dimension of variable length is written as -1.
        Return:
            dict. For example, for text classification and matching tasks
            the bert backbone may output
                {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
                 "sentence_emb": ([-1, hidden_size], 'float32'),
                 "sim_vec": ([-1, hidden_size], 'float32')}"""
        raise NotImplementedError()

    def build(self, inputs):
        """Builds the backbone's computation graph, mapping static-graph
        Variables conforming to inputs_attr into output Variables conforming
        to outputs_attr.
        Args:
            inputs: dict. Maps object names in inputs_attr to graph
                Variables; it contains at least the objects defined in
                inputs_attr.
        Return:
            the graph variables to output. They are added to the fetch_list,
            so their runtime values are computed at every train/predict step
            and handed to the postprocess method for user handling.
        """
        raise NotImplementedError()


class task_paradigm(object):

    def __init__(self, config, phase, backbone_config):
        """
        config: dict. The hyperparameters defined in the task instance config
            plus the multi-task config file.
        phase: str. The running phase; currently train and predict are
            supported.
        """

    @property
    def inputs_attrs(self):
        """Describes the attributes of the input objects the task_layer
        reads from input sources such as the reader and the backbone. The
        first-level key names the source (backbone, reader, ...; more
        flexible inputs will be supported later); the second level describes
        each object of that source: name, shape and dtype. Scalar types take
        shape []; a dimension of variable length is written as -1.
        Return:
            dict. Attribute descriptions per source and per object."""
        raise NotImplementedError()

    @property
    def outputs_attr(self):
        """Describes the attributes of the task's output objects: name,
        shape and dtype. Output objects are added to the fetch_list, so
        their runtime values are computed at every train/predict step and
        handed to the postprocess method for user handling. Scalar types
        take shape []; a dimension of variable length is written as -1.
        Return:
            dict. Note that in the train phase an output named loss is
            mandatory.
        """
        raise NotImplementedError()

    def build(self, inputs):
        """Builds the task_layer's computation graph, mapping static-graph
        Variables from the input sources (conforming to inputs_attrs) into
        output Variables conforming to outputs_attr.
        Args:
            inputs: dict. Maps object names in inputs_attrs to graph
                Variables; it contains at least the objects defined in
                inputs_attr.
        Return:
            the graph variables to output. They are added to the fetch_list,
            so their runtime values are computed at every train/predict step
            and handed to the postprocess method for user handling.
        """
        raise NotImplementedError()

    def postprocess(self, rt_outputs):
        """Post-processes the task_layer's runtime outputs for the current
        batch after each train or predict step. Note that besides the
        outputs of build, rt_outputs automatically contains the computed
        loss as well."""
        pass

    def post_postprocess(self, global_buffer):
        pass
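A minimal sketch of a concrete reader conforming to the interface above; the data and field names are invented for illustration, only the three overridden members are dictated by the interface:

# Hypothetical reader implementing the interface above; the in-memory data
# source and its field names are made up.
class ToyReader(reader):
    def __init__(self, config):
        super(ToyReader, self).__init__(config)
        self._samples = [{'token_ids': [[1, 2, 3]], 'label': [0]},
                         {'token_ids': [[4, 5]], 'label': [1]}]

    @property
    def outputs_attr(self):
        return {'token_ids': ([-1, -1], 'int64'),
                'label': ([-1], 'int64')}

    def iterator(self):
        while True:  # resets automatically, as required by the interface
            for sample in self._samples:
                yield sample

    @property
    def num_examples(self):
        return len(self._samples)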
build/lib/paddlepalm/mtl_controller.py
已删除
100644 → 0
浏览文件 @
e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
importlib
import
multiprocessing
from
paddle
import
fluid
from
paddle.fluid
import
layers
import
yaml
import
json
import
logging
import
time
import
numpy
as
np
from
paddlepalm.utils.saver
import
init_pretraining_params
,
init_checkpoint
from
paddlepalm.utils.config_helper
import
PDConfig
from
paddlepalm.utils.print_helper
import
print_dict
from
paddlepalm.utils.reader_helper
import
create_net_inputs
,
create_iterator_fn
,
create_joint_iterator_fn
,
merge_input_attrs
from
paddlepalm.default_settings
import
*
from
paddlepalm.task_instance
import
TaskInstance
,
check_instances
DEBUG
=
False
VERBOSE
=
0
def
_get_basename
(
f
):
return
os
.
path
.
splitext
(
f
)[
0
]
def
_get_suffix
(
f
):
return
os
.
path
.
splitext
(
f
)[
-
1
]
def
_parse_yaml
(
f
,
asdict
=
True
,
support_cmd_line
=
False
):
assert
os
.
path
.
exists
(
f
),
"file {} not found."
.
format
(
f
)
if
support_cmd_line
:
args
=
PDConfig
(
yaml_file
=
f
,
fuse_args
=
True
)
args
.
build
()
return
args
.
asdict
()
if
asdict
else
args
else
:
if
asdict
:
with
open
(
f
,
"r"
)
as
fin
:
yaml_config
=
yaml
.
load
(
fin
,
Loader
=
yaml
.
SafeLoader
)
return
yaml_config
else
:
raise
NotImplementedError
()
def
_parse_json
(
f
,
asdict
=
True
,
support_cmd_line
=
False
):
assert
os
.
path
.
exists
(
f
),
"file {} not found."
.
format
(
f
)
if
support_cmd_line
:
args
=
PDConfig
(
json_file
=
f
,
fuse_args
=
support_cmd_line
)
args
.
build
()
return
args
.
asdict
()
if
asdict
else
args
else
:
if
asdict
:
with
open
(
f
,
"r"
)
as
fin
:
config
=
json
.
load
(
fin
)
return
config
else
:
raise
NotImplementedError
()
def
_parse_list
(
string
,
astype
=
str
):
assert
isinstance
(
string
,
str
),
"{} is not a string."
.
format
(
string
)
if
','
not
in
string
:
return
[
astype
(
string
)]
string
=
string
.
replace
(
','
,
' '
)
return
[
astype
(
i
)
for
i
in
string
.
split
()]
def
_try_float
(
s
):
try
:
float
(
s
)
return
(
float
(
s
))
except
:
return
s
def
_check_conf
(
conf
,
checklist
=
None
):
assert
isinstance
(
conf
,
dict
),
"{} is not a dict."
.
format
(
conf
)
ret
=
{}
for
k
,
v
in
conf
.
items
():
if
isinstance
(
v
,
str
):
v
=
_try_float
(
v
)
ret
[
k
]
=
v
if
checklist
is
not
None
:
for
k
,
t
in
checklist
:
assert
k
in
ret
,
"required argument {} is NOT exist in config file."
.
format
(
k
)
assert
isintance
(
ret
[
k
],
t
),
"value type of argument {} should be {}"
.
format
(
k
,
t
)
return
ret
# TODO: 增加None机制,允许hidden size、batch size和seqlen设置为None
def
_check_io
(
in_attr
,
out_attr
,
strict
=
False
,
in_name
=
"left"
,
out_name
=
"right"
):
for
name
,
attr
in
in_attr
.
items
():
assert
name
in
out_attr
,
in_name
+
': '
+
name
+
' not found in '
+
out_name
if
attr
!=
out_attr
[
name
]:
if
strict
:
raise
ValueError
(
name
+
': shape or dtype not consistent!'
)
else
:
logging
.
warning
(
'{}: shape or dtype not consistent!
\n
{}:
\n
{}
\n
{}:
\n
{}'
.
format
(
name
,
in_name
,
attr
,
out_name
,
out_attr
[
name
]))
def
_merge_conf
(
conf1
,
conf2
,
conf1_first
=
True
,
strict
=
False
):
assert
isinstance
(
conf1
,
dict
),
"{} is not a dict."
.
format
(
conf1
)
assert
isinstance
(
conf2
,
dict
),
"{} is not a dict."
.
format
(
conf2
)
base_conf
=
conf2
if
conf1_first
else
conf1
base_conf
=
base_conf
.
copy
()
new_conf
=
conf1
if
conf1_first
else
conf2
for
k
,
v
in
new_conf
.
items
():
if
k
in
base_conf
:
if
base_conf
[
k
]
!=
v
:
raise
Warning
(
"value of argument {} has been updated to {}."
.
format
(
k
,
v
))
else
:
if
strict
:
continue
base_conf
[
k
]
=
v
return
base_conf
def
_encode_inputs
(
inputs
,
scope_name
,
sep
=
'/'
,
cand_set
=
None
):
outputs
=
{}
for
k
,
v
in
inputs
.
items
():
if
cand_set
is
not
None
:
if
k
in
cand_set
:
outputs
[
k
]
=
v
if
scope_name
+
sep
+
k
in
cand_set
:
outputs
[
scope_name
+
sep
+
k
]
=
v
else
:
outputs
[
scope_name
+
sep
+
k
]
=
v
return
outputs
def
_decode_inputs
(
inputs
,
scope_name
,
sep
=
'/'
,
keep_unk_keys
=
True
):
outputs
=
{}
for
name
,
value
in
inputs
.
items
():
# var for backbone are also available to tasks
if
keep_unk_keys
and
sep
not
in
name
:
outputs
[
name
]
=
value
# var for this inst
if
name
.
startswith
(
scope_name
+
'/'
):
outputs
[
name
[
len
(
scope_name
+
'/'
):]]
=
value
return
outputs
def
_init_env
(
use_gpu
):
if
use_gpu
:
place
=
fluid
.
CUDAPlace
(
0
)
dev_count
=
fluid
.
core
.
get_cuda_device_count
()
else
:
place
=
fluid
.
CPUPlace
()
dev_count
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
return
fluid
.
Executor
(
place
),
dev_count
def
_fit_attr
(
conf
,
fit_attr
,
strict
=
False
):
for
i
,
attr
in
fit_attr
.
items
():
if
i
not
in
conf
:
if
strict
:
raise
Exception
(
'Argument {} is required to create a controller.'
.
format
(
i
))
else
:
continue
conf
[
i
]
=
attr
(
conf
[
i
])
return
conf
class
Controller
(
object
):
def
__init__
(
self
,
config
=
None
,
task_dir
=
'.'
,
for_train
=
True
):
"""
Args:
config: (str|dict) 字符串类型时,给出yaml格式的config配置文件路径;
"""
self
.
_for_train
=
for_train
# default mtl_conf
# if config is None and config_path is None:
# raise ValueError('For config and config_path, at least one of them should be set.')
if
isinstance
(
config
,
str
):
mtl_conf
=
_parse_yaml
(
config
,
support_cmd_line
=
True
)
# if config is not None:
# mtl_conf = _merge_conf(config, mtl_conf)
else
:
mtl_conf
=
config
mtl_conf
=
_check_conf
(
mtl_conf
)
mtl_conf
=
_fit_attr
(
mtl_conf
,
REQUIRED_ARGS
,
strict
=
True
)
mtl_conf
=
_fit_attr
(
mtl_conf
,
OPTIONAL_ARGS
,
strict
=
False
)
exe
,
dev_count
=
_init_env
(
use_gpu
=
mtl_conf
.
get
(
'use_gpu'
,
True
))
self
.
exe
=
exe
self
.
dev_count
=
dev_count
print_dict
(
mtl_conf
,
title
=
'main configuration'
)
# parse task instances and target tags
instnames
=
_parse_list
(
mtl_conf
[
'task_instance'
])
assert
len
(
instnames
)
==
len
(
set
(
instnames
)),
"repeated task_instance is NOT supported."
num_instances
=
len
(
instnames
)
self
.
num_instances
=
num_instances
instname_to_conf
=
{}
instname_to_id
=
{}
for
id
,
instname
in
enumerate
(
instnames
):
instpath
=
os
.
path
.
join
(
task_dir
,
instname
+
'.yaml'
)
conf
=
_parse_yaml
(
instpath
,
support_cmd_line
=
False
)
# conf = _check_conf(conf, TASK_INSTANCE_REQUIRED_ARGS)
conf
=
_check_conf
(
conf
)
temp_conf
=
_merge_conf
(
mtl_conf
,
conf
,
strict
=
True
)
print_dict
(
temp_conf
,
title
=
'{} configuration'
.
format
(
instname
))
conf
=
_merge_conf
(
mtl_conf
,
conf
)
instname_to_conf
[
instname
]
=
conf
instname_to_id
[
instname
]
=
id
# create task instances
instances
=
[]
for
name
in
instnames
:
instances
.
append
(
TaskInstance
(
name
,
instname_to_id
[
name
],
instname_to_conf
[
name
]))
check_instances
(
instances
)
# parse target_tag
if
'target_tag'
in
mtl_conf
:
target_tag
=
str
(
mtl_conf
[
'target_tag'
])
tags
=
_parse_list
(
target_tag
,
astype
=
int
)
assert
len
(
tags
)
==
len
(
instnames
),
"number of target_tag is NOT consistent with that in task_instance."
for
tag
,
inst
in
zip
(
tags
,
instances
):
inst
.
is_target
=
tag
else
:
tags
=
[
i
.
is_target
for
i
in
instances
]
num_targets
=
sum
(
tags
)
num_auxes
=
num_instances
-
num_targets
# parse mix ratios
if
'mix_ratio'
in
mtl_conf
:
mix_ratio
=
str
(
mtl_conf
[
'mix_ratio'
])
mrs
=
_parse_list
(
mix_ratio
,
astype
=
float
)
assert
len
(
mrs
)
==
num_instances
,
"number of mix_ratios is NOT consistent with num_instances."
else
:
# TODO: 增加joint training模式,让num_epochs平等的作用于每个instance
mrs
=
[
1.0
]
*
num_instances
for
mr
,
inst
in
zip
(
mrs
,
instances
):
inst
.
mix_ratio
=
mr
# parse task layer reuse tags
instname_to_reusehost
=
{
i
:
i
for
i
in
instnames
}
if
'task_reuse_tag'
in
mtl_conf
:
tags
=
_parse_list
(
mtl_conf
[
'task_reuse_tag'
],
astype
=
int
)
assert
len
(
tags
)
==
num_targets
,
'number of reuse_tags is NOT consistent with number of instances.'
else
:
tags
=
[]
mapper
=
{}
for
inst
in
instances
:
# 有环则tag_id + 1,否则被mapper shutdown
history
=
set
()
history
.
add
(
inst
.
name
)
cur_inst
=
inst
while
True
:
# 发现有环
if
cur_inst
.
task_reuse_scope
in
history
:
mapper
[
inst
.
name
]
=
len
(
tags
)
break
# 发现在mapper中
elif
cur_inst
.
task_reuse_scope
in
mapper
:
mapper
[
inst
.
name
]
=
mapper
[
cur_inst
.
task_reuse_scope
]
break
else
:
cur_inst
=
name_to_instance
[
cur_inst
.
task_reuse_scope
]
history
.
add
(
cur_inst
.
name
)
tags
.
append
(
mapper
[
inst
.
name
])
# 注意,上面这段需要做单元测试
for
i
in
range
(
1
,
num_instances
):
for
j
in
range
(
i
):
if
tags
[
i
]
==
tags
[
j
]:
# check paradigm of reused tasks
assert
instances
[
i
].
task_paradigm
==
\
instances
[
j
].
task_paradigm
,
\
"paradigm of reuse tasks should be consistent"
instances
[
i
].
task_reuse_scope
=
instances
[
j
].
name
break
# parse Reader and Paradigm for each instance
for
inst
in
instances
:
reader_name
=
inst
.
config
[
'reader'
]
reader_mod
=
importlib
.
import_module
(
READER_DIR
+
'.'
+
reader_name
)
Reader
=
getattr
(
reader_mod
,
'Reader'
)
parad_name
=
inst
.
config
[
'paradigm'
]
parad_mod
=
importlib
.
import_module
(
PARADIGM_DIR
+
'.'
+
parad_name
)
Paradigm
=
getattr
(
parad_mod
,
'TaskParadigm'
)
inst
.
Reader
=
Reader
inst
.
Paradigm
=
Paradigm
# prepare backbone
if
'backbone_config_path'
in
mtl_conf
:
bb_conf
=
_parse_json
(
mtl_conf
[
'backbone_config_path'
])
bb_conf
=
_merge_conf
(
mtl_conf
,
bb_conf
)
else
:
bb_conf
=
mtl_conf
print_dict
(
bb_conf
,
title
=
'backbone configuration'
.
format
(
instname
))
bb_name
=
mtl_conf
[
'backbone'
]
bb_mod
=
importlib
.
import_module
(
BACKBONE_DIR
+
'.'
+
bb_name
)
Backbone
=
getattr
(
bb_mod
,
'Model'
)
self
.
instances
=
instances
self
.
mrs
=
mrs
self
.
Backbone
=
Backbone
self
.
bb_conf
=
bb_conf
self
.
bb_name
=
bb_name
self
.
has_init_train
=
False
self
.
has_init_pred
=
False
if
self
.
_for_train
:
print
(
"initialing for training..."
)
self
.
_init_train
()
self
.
has_init_train
=
True
def
_init_train
(
self
):
instances
=
self
.
instances
Backbone
=
self
.
Backbone
bb_conf
=
self
.
bb_conf
bb_name
=
self
.
bb_name
dev_count
=
self
.
dev_count
num_instances
=
len
(
instances
)
mrs
=
self
.
mrs
# set first_target/main task instance
main_inst
=
None
for
inst
in
instances
:
if
inst
.
is_target
:
main_inst
=
inst
inst
.
is_first_target
=
True
break
main_conf
=
main_inst
.
config
if
not
os
.
path
.
exists
(
main_conf
[
'save_path'
]):
os
.
makedirs
(
main_conf
[
'save_path'
])
# prepare backbone
train_backbone
=
Backbone
(
bb_conf
,
phase
=
'train'
)
pred_backbone
=
Backbone
(
bb_conf
,
phase
=
'pred'
)
# create reader, task
# then check i/o across reader, backbone and task_layer
task_attrs
=
[]
pred_task_attrs
=
[]
for
inst
in
instances
:
train_reader
=
inst
.
Reader
(
inst
.
config
,
phase
=
'train'
)
inst
.
reader
[
'train'
]
=
train_reader
train_parad
=
inst
.
Paradigm
(
inst
.
config
,
phase
=
'train'
,
backbone_config
=
bb_conf
)
inst
.
task_layer
[
'train'
]
=
train_parad
task_attr_from_reader
=
_encode_inputs
(
train_parad
.
inputs_attrs
[
'reader'
],
inst
.
name
)
task_attrs
.
append
(
task_attr_from_reader
)
_check_io
(
train_backbone
.
inputs_attr
,
train_reader
.
outputs_attr
,
in_name
=
bb_name
+
'_backbone'
,
out_name
=
'reader.train'
)
_check_io
(
train_parad
.
inputs_attrs
[
'reader'
],
train_reader
.
outputs_attr
,
in_name
=
'task_paradigm.train.reader'
,
out_name
=
'reader.train'
)
_check_io
(
train_parad
.
inputs_attrs
[
'backbone'
],
train_backbone
.
outputs_attr
,
in_name
=
'task_paradigm.train.backbone'
,
out_name
=
bb_name
+
'_backbone'
)
if
inst
.
is_target
:
if
'pred_file'
not
in
inst
.
config
:
inst
.
config
[
'pred_file'
]
=
''
pred_reader
=
inst
.
Reader
(
inst
.
config
,
phase
=
'pred'
)
pred_parad
=
inst
.
Paradigm
(
inst
.
config
,
phase
=
'pred'
,
backbone_config
=
bb_conf
)
# inst.reader['pred'] = pred_reader # 这里创建的reader是个假reader,只是为了读取output_attr而已,所以不做保存
inst
.
task_layer
[
'pred'
]
=
pred_parad
# 框架有巨坑,先这样写吧
task_attr_from_reader
=
_encode_inputs
(
pred_parad
.
inputs_attrs
[
'reader'
],
inst
.
name
)
pred_task_attrs
.
append
(
task_attr_from_reader
)
# task_attr = pred_parad.inputs_attrs['reader']
_check_io
(
pred_backbone
.
inputs_attr
,
pred_reader
.
outputs_attr
,
in_name
=
bb_name
+
'_backbone'
,
out_name
=
'reader.pred'
)
_check_io
(
pred_parad
.
inputs_attrs
[
'reader'
],
pred_reader
.
outputs_attr
,
in_name
=
'task_paradigm.pred.reader'
,
out_name
=
'reader.pred'
)
_check_io
(
pred_parad
.
inputs_attrs
[
'backbone'
],
pred_backbone
.
outputs_attr
,
in_name
=
'task_paradigm.pred.backbone'
,
out_name
=
bb_name
+
'_backbone'
)
# merge reader input attrs from backbone and task_instances
joint_input_names
,
joint_shape_and_dtypes
,
name_to_position
=
merge_input_attrs
(
train_backbone
.
inputs_attr
,
task_attrs
)
pred_joint_input_names
,
pred_joint_shape_and_dtypes
,
_
=
merge_input_attrs
(
pred_backbone
.
inputs_attr
,
pred_task_attrs
,
insert_taskid
=
False
)
# shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]
if
DEBUG
:
print
(
'----- for debug -----'
)
print
(
'joint input names:'
)
print
(
joint_input_names
)
print
(
'joint input shape and dtypes:'
)
print
(
joint_shape_and_dtypes
)
# load data
for
inst
in
instances
:
print
(
inst
.
name
+
": preparing data..."
)
inst
.
reader
[
'train'
].
load_data
()
# merge dataset iterators and create net input vars
iterators
=
[]
prefixes
=
[]
mrs
=
[]
for
inst
in
instances
:
iterators
.
append
(
inst
.
reader
[
'train'
].
iterator
())
prefixes
.
append
(
inst
.
name
)
mrs
.
append
(
inst
.
mix_ratio
)
joint_iterator_fn
=
create_joint_iterator_fn
(
iterators
,
prefixes
,
joint_shape_and_dtypes
,
mrs
,
name_to_position
,
dev_count
=
dev_count
,
verbose
=
VERBOSE
,
batch_size
=
main_conf
[
'batch_size'
])
input_attrs
=
[[
i
,
j
,
k
]
for
i
,
(
j
,
k
)
in
zip
(
joint_input_names
,
joint_shape_and_dtypes
)]
pred_input_attrs
=
[[
i
,
j
,
k
]
for
i
,
(
j
,
k
)
in
zip
(
pred_joint_input_names
,
pred_joint_shape_and_dtypes
)]
net_inputs
=
create_net_inputs
(
input_attrs
,
async
=
True
,
iterator_fn
=
joint_iterator_fn
,
dev_count
=
dev_count
,
n_prefetch
=
3
)
# build backbone and task layers
# 不指定scope名字会挂,框架有坑
with
fluid
.
unique_name
.
guard
(
"backbone-"
):
bb_output_vars
=
train_backbone
.
build
(
net_inputs
)
# bb_output_vars = train_backbone.build(net_inputs)
assert
sorted
(
bb_output_vars
.
keys
())
==
sorted
(
train_backbone
.
outputs_attr
.
keys
())
# 会挂
# 这里是否有必要新建一个program?是的,被坑死了
pred_prog
=
fluid
.
Program
()
pred_init_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
default_main_program
()
train_init_prog
=
fluid
.
default_startup_program
()
with
fluid
.
program_guard
(
main_program
=
pred_prog
,
startup_program
=
pred_init_prog
):
pred_net_inputs
=
create_net_inputs
(
pred_input_attrs
)
with
fluid
.
unique_name
.
guard
(
"backbone-"
):
pred_bb_output_vars
=
pred_backbone
.
build
(
pred_net_inputs
)
fluid
.
framework
.
switch_main_program
(
train_prog
)
fluid
.
framework
.
switch_startup_program
(
train_init_prog
)
# pred_backbone = train_backbone
# pred_bb_output_vars = bb_output_vars
task_output_vars
=
{}
for
inst
in
instances
:
task_inputs
=
{
'backbone'
:
bb_output_vars
}
task_inputs_from_reader
=
_decode_inputs
(
net_inputs
,
inst
.
name
)
task_inputs
[
'reader'
]
=
task_inputs_from_reader
scope
=
inst
.
task_reuse_scope
+
'/'
with
fluid
.
unique_name
.
guard
(
scope
):
output_vars
=
inst
.
build_task_layer
(
task_inputs
,
phase
=
'train'
)
output_vars
=
{
inst
.
name
+
'/'
+
key
:
val
for
key
,
val
in
output_vars
.
items
()}
old
=
len
(
task_output_vars
)
# for debug
task_output_vars
.
update
(
output_vars
)
assert
len
(
task_output_vars
)
-
old
==
len
(
output_vars
)
# for debug
# # prepare predict vars for saving inference model
if
inst
.
is_target
:
# task_attr = inst.task_layer['pred'].inputs_attrs['reader']
# _input_names, _shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, task_attr, insert_taskid=False)
# pred_input_attrs = [[i, j, k] for i, (j,k) in zip(_input_names, _shape_and_dtypes)]
with
fluid
.
program_guard
(
pred_prog
,
pred_init_prog
):
# pred_net_inputs = create_net_inputs(pred_input_attrs)
# 这里同时建立了pred阶段的backbone计算图,不知道是否会造成额外的显存开销(paddle不会计算运行路径)
cur_inputs
=
_decode_inputs
(
pred_net_inputs
,
inst
.
name
)
inst
.
pred_input
=
cur_inputs
pred_task_inputs
=
{
'backbone'
:
pred_bb_output_vars
,
'reader'
:
cur_inputs
}
scope
=
inst
.
task_reuse_scope
+
'/'
with
fluid
.
unique_name
.
guard
(
scope
):
inst
.
build_task_layer
(
pred_task_inputs
,
phase
=
'pred'
)
bb_fetches
=
{
k
:
v
.
name
for
k
,
v
in
bb_output_vars
.
items
()}
task_fetches
=
{
k
:
v
.
name
for
k
,
v
in
task_output_vars
.
items
()}
# fetches = bb_fetches.copy() # 注意!框架在多卡时无法fetch变长维度的tensor,这里加入bb的out后会挂
# fetches.update(task_fetches)
fetches
=
task_fetches
fetches
[
'__task_id'
]
=
net_inputs
[
'__task_id'
].
name
# compute loss
task_id_var
=
net_inputs
[
'__task_id'
]
task_id_vec
=
layers
.
one_hot
(
task_id_var
,
num_instances
)
losses
=
fluid
.
layers
.
concat
([
task_output_vars
[
inst
.
name
+
'/loss'
]
for
inst
in
instances
],
axis
=
0
)
loss
=
layers
.
reduce_sum
(
task_id_vec
*
losses
)
main_reader
=
main_inst
.
reader
[
'train'
]
num_examples
=
main_reader
.
num_examples
for
inst
in
instances
:
max_train_steps
=
int
(
main_conf
[
'num_epochs'
]
*
inst
.
mix_ratio
*
num_examples
)
//
main_conf
[
'batch_size'
]
//
dev_count
if
inst
.
is_target
:
print
(
'{}: expected train steps {}.'
.
format
(
inst
.
name
,
max_train_steps
))
inst
.
steps_pur_epoch
=
inst
.
reader
[
'train'
].
num_examples
//
main_conf
[
'batch_size'
]
//
dev_count
inst
.
expected_train_steps
=
max_train_steps
global_max_train_steps
=
int
(
main_conf
[
'num_epochs'
]
*
num_examples
*
sum
(
mrs
))
//
main_conf
[
'batch_size'
]
//
dev_count
print
(
'Estimated overall train steps {}.'
.
format
(
global_max_train_steps
))
if
'warmup_proportion'
in
main_conf
and
main_conf
[
'warmup_proportion'
]
>
0
:
warmup_steps
=
int
(
global_max_train_steps
*
main_conf
[
'warmup_proportion'
])
print
(
'Warmup steps: '
+
str
(
warmup_steps
))
else
:
warmup_steps
=
0
# steps_pur_epoch = num_examples // main_conf['batch_size'] // dev_count
# build optimizer
# 其实也完全可以支持每个任务用它自己的optimizer
if
'optimizer'
in
main_conf
:
optim_mod
=
importlib
.
import_module
(
OPTIMIZER_DIR
+
'.'
+
main_conf
[
'optimizer'
])
optimize
=
getattr
(
optim_mod
,
OPTIMIZE_METHOD
)
optimize
(
loss
,
main_conf
,
max_train_steps
,
warmup_steps
,
fluid
.
default_main_program
())
loss
.
persistable
=
True
if
main_conf
.
get
(
'use_ema'
,
False
):
assert
'ema_decay'
in
main_conf
,
"ema_decay should be set when use_ema is enabled."
ema
=
fluid
.
optimizer
.
ExponentialMovingAverage
(
main_conf
[
'ema_decay'
])
ema
.
update
()
# prepare for train
self
.
train_backbone
=
train_backbone
self
.
train_program
=
fluid
.
CompiledProgram
(
fluid
.
default_main_program
()).
with_data_parallel
(
loss_name
=
loss
.
name
)
self
.
saver_program
=
fluid
.
default_main_program
()
self
.
main_inst
=
main_inst
self
.
fetches
=
fetches
self
.
has_init_train
=
True
self
.
has_init_pred
=
True
# self.max_train_steps = max_train_steps
# self.steps_pur_epoch = steps_pur_epoch
self
.
exe
.
run
(
fluid
.
default_startup_program
())
print
(
"
\n
Randomly initialize parameters...
\n
"
)
def
_init_pred
(
self
,
instance
,
infer_model_path
):
inst
=
instance
pred_backbone
=
self
.
Backbone
(
self
.
bb_conf
,
phase
=
'pred'
)
pred_parad
=
inst
.
Paradigm
(
inst
.
config
,
phase
=
'pred'
,
backbone_config
=
self
.
bb_conf
)
inst
.
task_layer
[
'pred'
]
=
pred_parad
pred_joint_input_names
,
pred_joint_shape_and_dtypes
,
name_to_position
=
merge_input_attrs
(
pred_backbone
.
inputs_attr
,
inst
.
task_layer
[
'pred'
].
inputs_attrs
[
'reader'
],
insert_taskid
=
False
)
pred_prog
=
inst
.
load
(
infer_model_path
)
# pred_prog = fluid.CompiledProgram(pred_prog).with_data_parallel()
if
inst
.
reader
[
'pred'
]
is
None
:
pred_reader
=
inst
.
Reader
(
inst
.
config
,
phase
=
'pred'
)
inst
.
reader
[
'pred'
]
=
pred_reader
return
pred_prog
def
load_pretrain
(
self
,
pretrain_model_path
=
None
):
# load pretrain model (or ckpt)
if
pretrain_model_path
is
None
:
assert
'pretrain_model_path'
in
self
.
main_conf
,
"pretrain_model_path NOT set."
pretrain_model_path
=
self
.
main_conf
[
'pretrain_model_path'
]
init_pretraining_params
(
self
.
exe
,
pretrain_model_path
,
main_program
=
fluid
.
default_startup_program
())
    def train(self):
        # TODO: back up the config files here so that users can resume
        # training from a checkpoint and run prediction later
        if not self.has_init_train:
            self._init_train()
            self.has_init_train = True

        instances = self.instances
        num_instances = self.num_instances
        main_inst = self.main_inst
        main_conf = main_inst.config

        backbone = self.train_backbone
        train_program = self.train_program
        saver_program = self.saver_program
        fetches = self.fetches
        # max_train_steps = self.max_train_steps
        # steps_pur_epoch = self.steps_pur_epoch

        finish = []
        for inst in instances:
            if inst.is_target:
                finish.append(False)

        def train_finish():
            for inst in instances:
                if inst.is_target:
                    if not inst.train_finish:
                        return False
            return True

        # do training
        # loss_fetches = {inst.name+'/loss': inst.task_layer['train'].loss for inst in instances}
        # old = len(fetches) # for debug
        # fetches.update(loss_fetches)
        # assert len(fetches) == old + len(loss_fetches) # for debug and avoid user-caused bug
        # assert 'task_id' not in fetches # for debug and avoid user-caused bug
        # fetches['task_id'] = task_id_var
        fetch_names, fetch_list = zip(*fetches.items())

        main_step = 0  # only count for main task
        global_step = 0  # count for all tasks
        epoch = 0
        time_begin = time.time()
        backbone_buffer = []
        while not train_finish():
            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
            rt_outputs = {k: v for k, v in zip(fetch_names, rt_outputs)}

            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
            assert (not isinstance(rt_task_id, list)) or len(set(rt_task_id)) == 1, rt_task_id
            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
            cur_task = instances[rt_task_id]

            backbone_rt_outputs = {k: v for k, v in rt_outputs.items() if '/' not in k}
            backbone_buffer.append(backbone.postprocess(backbone_rt_outputs))

            task_rt_outputs = {k[len(cur_task.name + '/'):]: v
                               for k, v in rt_outputs.items()
                               if k.startswith(cur_task.name + '/')}
            instances[rt_task_id].task_layer['train'].postprocess(task_rt_outputs)

            global_step += 1
            # if cur_task.is_target:
            cur_task.cur_train_step += 1

            if global_step % main_conf.get('print_every_n_steps', 5) == 0:
                loss = rt_outputs[cur_task.name + '/loss']
                loss = np.mean(np.squeeze(loss)).tolist()

                time_end = time.time()
                time_cost = time_end - time_begin

                print("Global step: {}. Task: {}, step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
                    global_step, cur_task.name, cur_task.cur_train_step,
                    cur_task.steps_pur_epoch, cur_task.cur_train_epoch, loss,
                    main_conf.get('print_every_n_steps', 5) / time_cost))
                time_begin = time.time()

            if 'save_every_n_steps' in main_conf and \
                    global_step % main_conf['save_every_n_steps'] == 0:
                save_path = os.path.join(main_conf['save_path'],
                                         "step_" + str(global_step))
                fluid.io.save_persistables(self.exe, save_path, saver_program)

        save_path = os.path.join(main_conf['save_path'],
                                 "step_" + str(global_step) + "_final")
        fluid.io.save_persistables(self.exe, save_path, saver_program)
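A note on the fetch-name convention the loop above relies on: backbone outputs are fetched under plain names, while task-layer outputs are prefixed with `<task_name>/`, so one fetch dict can be split per step. A minimal self-contained sketch of that routing (the names below are illustrative only):

    # Toy fetch dict as the loop would see it; 'mrqa' stands in for a task name.
    rt_outputs = {'__task_id': 0, 'sentence_emb': ..., 'mrqa/loss': ...}

    cur_task_name = 'mrqa'
    backbone_out = {k: v for k, v in rt_outputs.items() if '/' not in k}
    task_out = {k[len(cur_task_name + '/'):]: v
                for k, v in rt_outputs.items()
                if k.startswith(cur_task_name + '/')}
    # backbone_out -> {'__task_id': 0, 'sentence_emb': ...}
    # task_out     -> {'loss': ...}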
    def pred(self, task_instance, inference_model_dir=None):
        if self._for_train:
            raise Exception('This controller is a trainer. Please build a new controller with for_train=False for predicting.')

        assert isinstance(task_instance, str)
        if isinstance(inference_model_dir, str):
            assert os.path.exists(inference_model_dir), inference_model_dir + " not found."
        if not self.has_init_pred and inference_model_dir is None:
            raise ValueError('infer_model_path is required for prediction.')

        instance = None
        for inst in self.instances:
            if inst.name == task_instance:
                instance = inst
                break
        if instance is None:
            raise ValueError(task_instance + ' is not a valid task_instance.')

        pred_prog = self._init_pred(instance, inference_model_dir)

        inst = instance
        inst.reader['pred'].load_data()
        fetch_names, fetch_vars = inst.pred_fetch_list
        # iterator = create_iterator_fn(inst.reader['pred'].iterator, inst.name, pred_joint_shape_and_dtypes, name_to_position)

        mapper = {k: v for k, v in inst.pred_input}
        buf = []
        for feed in inst.reader['pred'].iterator():
            feed = _encode_inputs(feed, inst.name, cand_set=mapper)
            feed = {mapper[k]: v for k, v in feed.items()}
            rt_outputs = self.exe.run(pred_prog, feed, fetch_vars)
            rt_outputs = {k: v for k, v in zip(fetch_names, rt_outputs)}
            inst.postprocess(rt_outputs, phase='pred')
        reader_outputs = inst.reader['pred'].get_epoch_outputs()
        inst.epoch_postprocess({'reader': reader_outputs}, phase='pred')


if __name__ == '__main__':
    assert len(sys.argv) == 2, "Usage: python mtl_controller.py <mtl_conf_path>"
    conf_path = sys.argv[1]
    del sys.argv[1]
    controller = Controller(conf_path)
    if controller.main_conf['do_train']:
        controller.train()
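For completeness, the controller can also be driven programmatically instead of through the `__main__` entry point above; a minimal sketch (the config filename is hypothetical):

    from paddlepalm.mtl_controller import Controller

    controller = Controller('mtl_conf.yaml')  # hypothetical config path
    controller.load_pretrain()                # uses pretrain_model_path from the config
    controller.train()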
build/lib/paddlepalm/optimizer/__init__.py
deleted (100644 → 0)
build/lib/paddlepalm/optimizer/adam.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle.fluid as fluid


def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """Applies linear warmup of learning rate from 0 and decay to 0."""
    with fluid.default_main_program()._lr_schedule_guard():
        lr = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")

        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()

        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                fluid.layers.tensor.assign(warmup_lr, lr)
            with switch.default():
                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.tensor.assign(decayed_lr, lr)

        return lr
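Read as a plain function of the step counter, the schedule above is a linear ramp followed by a linear (power-1.0 polynomial) decay to zero. A host-side mirror, useful for sanity-checking; a sketch under the assumption that `polynomial_decay` clips the step at `decay_steps` when `cycle=False`:

    def scheduled_lr_value(step, base_lr, warmup_steps, num_train_steps):
        """Host-side mirror of linear_warmup_decay (for inspection only)."""
        if step < warmup_steps:
            return base_lr * step / float(warmup_steps)
        # polynomial_decay with power=1.0, end_learning_rate=0.0, cycle=False
        frac = min(float(step), num_train_steps) / num_train_steps
        return base_lr * (1.0 - frac)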
def optimize(loss, config, max_train_steps=None, warmup_steps=0, train_program=None):
    if warmup_steps > 0:
        decay_strategy = config.get('lr_scheduler', 'linear_warmup_decay')
        if decay_strategy == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler \
                .noam_decay(1 / (warmup_steps * (config['learning_rate'] ** 2)),
                            warmup_steps)
        elif decay_strategy == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(config['learning_rate'],
                                               warmup_steps, max_train_steps)
        else:
            raise ValueError("Unknown lr_scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        optimizer = fluid.optimizer.Adam(learning_rate=config['learning_rate'])
        scheduled_lr = config['learning_rate']

    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    _, param_grads = optimizer.minimize(loss)

    for block in fluid.default_main_program().blocks:
        for var_name in block.vars:
            if var_name.startswith("embedding"):
                print(block.vars[var_name])

    if config.get('weight_decay', 0) > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param.name):
                continue
            with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
                    param.name] * config['weight_decay'] * scheduled_lr
                fluid.layers.assign(output=param, input=updated_param)
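The weight-decay block above is a manual, decoupled L2 update: after `minimize`, each decayed parameter is shifted by `weight_decay * scheduled_lr` times a frozen snapshot of itself, and layer-norm and bias parameters are skipped. To illustrate which names the exclusion rule catches, here is a standalone check (parameter names are made up; the helper is nested inside `optimize`, so it is restated here for the sketch):

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        for suffix in ["_bias", "_b", ".b_0"]:
            if name.endswith(suffix):
                return True
        return False

    for name in ["encoder_fc_0.w_0",           # decayed
                 "post_ffn_layer_norm_scale",  # skipped: layer_norm
                 "encoder_fc_0.b_0"]:          # skipped: bias suffix
        print(name, '->', 'skip' if exclude_from_weight_decay(name) else 'decay')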
build/lib/paddlepalm/reader/__init__.py
deleted (100644 → 0)
build/lib/paddlepalm/reader/cls4bert.py
deleted (100644 → 0)
build/lib/paddlepalm/reader/match4ernie.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.interface import reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader


class Reader(reader):

    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        self._is_training = phase == 'train'

        reader = ClassifyReader(config['vocab_path'],
                                max_seq_len=config['max_seq_len'],
                                do_lower_case=config.get('do_lower_case', False),
                                for_cn=config.get('for_cn', False),
                                random_seed=config.get('seed', None))
        self._reader = reader
        self._dev_count = dev_count

        self._batch_size = config['batch_size']
        self._max_seq_len = config['max_seq_len']
        if phase == 'train':
            self._input_file = config['train_file']
            self._num_epochs = None  # to keep the iterator from terminating
            self._shuffle = config.get('shuffle', False)
            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
        elif phase == 'eval':
            self._input_file = config['dev_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)
        elif phase == 'pred':
            self._input_file = config['pred_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)

        self._phase = phase
        # self._batch_size = 
        self._print_first_n = config.get('print_first_n', 1)

    @property
    def outputs_attr(self):
        if self._is_training:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
                    "label_ids": [[-1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64']
                    }
        else:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32']
                    }

    def load_data(self):
        self._data_generator = self._reader.data_generator(
            self._input_file, self._batch_size, self._num_epochs,
            dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)

    def iterator(self):

        def list_to_dict(x):
            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids',
                     'input_mask', 'label_ids', 'unique_ids']
            outputs = {n: i for n, i in zip(names, x)}
            del outputs['unique_ids']
            if not self._is_training:
                del outputs['label_ids']
            return outputs

        for batch in self._data_generator():
            yield list_to_dict(batch)

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)
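Each `outputs_attr` entry maps an output name to a `[shape, dtype]` pair, with `-1` marking dimensions resolved at runtime (batch size and sequence length). A minimal sketch of how a consumer could materialize these as input variables (illustrative only, not the framework's actual wiring):

    import paddle.fluid as fluid

    def build_inputs(outputs_attr):
        inputs = {}
        for name, (shape, dtype) in outputs_attr.items():
            # -1 dims are resolved at runtime (batch size / sequence length)
            inputs[name] = fluid.layers.data(
                name=name, shape=shape, dtype=dtype, append_batch_size=False)
        return inputs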
build/lib/paddlepalm/reader/mlm.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.interface import reader
from paddlepalm.reader.utils.reader4ernie import BaseReader


class Reader(reader):

    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        self._is_training = phase == 'train'

        reader = BaseReader(config['vocab_path'],
                            max_seq_len=config['max_seq_len'],
                            do_lower_case=config.get('do_lower_case', False),
                            for_cn=config.get('for_cn', False),
                            random_seed=config.get('seed', None))
        self._reader = reader
        self._dev_count = dev_count

        self._batch_size = config['batch_size']
        self._max_seq_len = config['max_seq_len']
        if phase == 'train':
            self._input_file = config['train_file']
            self._num_epochs = None  # to keep the iterator from terminating
            self._shuffle = config.get('shuffle', False)
            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
        elif phase == 'eval':
            self._input_file = config['dev_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)
        elif phase == 'pred':
            self._input_file = config['pred_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)

        self._phase = phase
        # self._batch_size = 
        self._print_first_n = config.get('print_first_n', 1)

    @property
    def outputs_attr(self):
        if self._is_training:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
                    "label_ids": [[-1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64']
                    }
        else:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32']
                    }

    def load_data(self):
        self._data_generator = self._reader.data_generator(
            self._input_file, self._batch_size, self._num_epochs,
            dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)

    def iterator(self):

        def list_to_dict(x):
            names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
                     'task_ids', 'mask_label', 'mask_pos']
            outputs = {n: i for n, i in zip(names, x)}
            return outputs

        for batch in self._data_generator():
            yield list_to_dict(batch)

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)
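The `mask_label`/`mask_pos` pair yielded above follows the flattened-index convention used by the batching utilities later in this commit: a masked token at (sent_index, token_index) is stored as `sent_index * max_len + token_index`. A small self-contained illustration of recovering the masked positions from an encoder output:

    import numpy as np

    enc = np.random.rand(2, 4, 8).astype('float32')  # [batch, max_len, hidden]
    mask_pos = np.array([[1], [6]], dtype='int64')   # (0,1) and (1,2), flattened with max_len=4
    flat = enc.reshape(-1, enc.shape[-1])            # [batch * max_len, hidden]
    masked_vecs = flat[mask_pos.squeeze(-1)]         # vectors at the masked positions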
build/lib/paddlepalm/reader/mrc4bert.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import collections
import json
import random

import numpy as np
import six

from paddlepalm.interface import reader
from paddlepalm.utils.textprocess_helper import is_whitespace
from paddlepalm.reader.utils.mrqa_helper import MRQAExample, MRQAFeature
import paddlepalm.tokenizer.bert_tokenizer as tokenization


class Reader(reader):

    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        self._is_training = phase == 'train'
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=config['vocab_path'],
            do_lower_case=config.get('do_lower_case', False))
        self._max_seq_length = config['max_seq_len']
        self._doc_stride = config['doc_stride']
        self._max_query_length = config['max_query_len']
        self._dev_count = dev_count
        self.print_prefix = print_prefix

        if phase == 'train':
            self._input_file = config['train_file']
            self._num_epochs = config['num_epochs']
            self._shuffle = config.get('shuffle', False)
            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
        if phase == 'eval':
            self._input_file = config['dev_file']
            self._num_epochs = 1
            self._shuffle = False
        elif phase == 'pred':
            self._input_file = config['predict_file']
            self._num_epochs = 1
            self._shuffle = False

        # self._batch_size = 
        self._batch_size = config['batch_size']
        self._pred_batch_size = config.get('pred_batch_size', self._batch_size)
        self._print_first_n = config.get('print_first_n', 1)
        self._with_negative = config.get('with_negative', False)
        self._sample_rate = config.get('sample_rate', 0.02)
        self._in_tokens = config.get('in_tokens', False)  # referenced by iterator(); config key assumed

        # TODO: without slide window version
        self._with_slide_window = config.get('with_slide_window', False)

        self.vocab = self._tokenizer.vocab
        self.vocab_size = len(self.vocab)
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]

        self.current_train_example = -1
        self.num_train_examples = -1
        self.current_train_epoch = -1

        self.n_examples = None

        print(print_prefix + 'reading raw data...')
        with open(self._input_file, "r") as f:
            self.raw_data = json.load(f)["data"]
        print(print_prefix + 'done!')
    @property
    def outputs_attr(self):
        if self._is_training:
            return {"token_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "position_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "segment_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "input_mask": [[-1, self._max_seq_length, 1], 'float32'],
                    "start_positions": [[-1, self._max_seq_length, 1], 'int64'],
                    "end_positions": [[-1, self._max_seq_length, 1], 'int64']
                    }
        else:
            return {"token_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "position_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "segment_ids": [[-1, self._max_seq_length, 1], 'int64'],
                    "input_mask": [[-1, self._max_seq_length, 1], 'float32'],
                    "unique_ids": [[-1, 1], 'int64']
                    }
    def iterator(self):
        features = []
        for i in range(self._num_epochs):
            if self._is_training:
                print(self.print_prefix + '{} epoch {} {}'.format('-' * 16, i, '-' * 16))
            example_id = 0
            feature_id = 1000000000
            for line in self.train_file:
                raw = self.parse_line(line)
                examples = _raw_to_examples(raw['context'], raw['qa_list'],
                                            is_training=self._is_training)
                for example in examples:
                    features.extend(_example_to_features(
                        example, example_id, self._tokenizer,
                        self._max_seq_length, self._doc_stride,
                        self._max_query_length,
                        id_offset=1000000000 + len(features),
                        is_training=self._is_training))
                    if len(features) >= self._batch_size * self._dev_count:
                        for batch, total_token_num in _features_to_batches(
                                features[:self._batch_size * self._dev_count],
                                self._batch_size, in_tokens=self._in_tokens):
                            temp = prepare_batch_data(
                                batch, total_token_num,
                                max_len=self._max_seq_length, voc_size=-1,
                                pad_id=self.pad_id, cls_id=self.cls_id,
                                sep_id=self.sep_id, mask_id=-1,
                                return_input_mask=True, return_max_len=False,
                                return_num_token=False)
                            if self._is_training:
                                tok_ids, pos_ids, seg_ids, input_mask, start_positions, end_positions = temp
                                yield {"token_ids": tok_ids,
                                       "position_ids": pos_ids,
                                       "segment_ids": seg_ids,
                                       "input_mask": input_mask,
                                       "start_positions": start_positions,
                                       'end_positions': end_positions}
                            else:
                                tok_ids, pos_ids, seg_ids, input_mask, unique_ids = temp
                                yield {"token_ids": tok_ids,
                                       "position_ids": pos_ids,
                                       "segment_ids": seg_ids,
                                       "input_mask": input_mask,
                                       "unique_ids": unique_ids}
                        features = features[self._batch_size * self._dev_count:]
                    example_id += 1

        # The last batch may be discarded when running with distributed prediction,
        # so we build some fake batches for the last prediction step.
        if not self._is_training and len(features) > 0:
            pred_batches = []
            for batch, total_token_num in _features_to_batches(
                    features[:self._batch_size * self._dev_count],
                    self._batch_size, in_tokens=self._in_tokens):
                pred_batches.append(prepare_batch_data(
                    batch, total_token_num, max_len=self._max_seq_length,
                    voc_size=-1, pad_id=self.pad_id, cls_id=self.cls_id,
                    sep_id=self.sep_id, mask_id=-1,
                    return_input_mask=True, return_max_len=False,
                    return_num_token=False))

            fake_batch = pred_batches[-1]
            fake_batch = fake_batch[:-1] + [np.array([-1] * len(fake_batch[0]))]
            pred_batches = pred_batches + [fake_batch] * (self._dev_count - len(pred_batches))
            for batch in pred_batches:
                yield batch
    @property
    def num_examples(self):
        if self.n_examples is None:
            self.n_examples = _estimate_runtime_examples(
                self.raw_data, self._sample_rate, self._tokenizer,
                self._max_seq_length, self._doc_stride, self._max_query_length,
                remove_impossible_questions=True, filter_invalid_spans=True)
        return self.n_examples
        # return math.ceil(n_examples * self._num_epochs / float(self._batch_size * self._dev_count))
def _raw_to_examples(context, qa_list, is_training=True,
                     remove_impossible_questions=True,
                     filter_invalid_spans=True):
    """
    Args:
        context: (str) the paragraph that provides information for QA
        qa_list: (list) nested dicts. Each element in qa_list should contain
            at least 'id' and 'question'. And the ....
    """
    examples = []
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in context:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    for qa in qa_list:
        qas_id = qa["id"]
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False

        if is_training:
            assert len(qa["answers"]) == 1, "For training, each question should have exactly 1 answer."
            if ('is_impossible' in qa) and (qa["is_impossible"]):
                if remove_impossible_questions or filter_invalid_spans:
                    continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                    is_impossible = True
            else:
                answer = qa["answers"][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]

                # remove corrupt samples
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    print("Could not find answer: '%s' vs. '%s'" %
                          (actual_text, cleaned_answer_text))
                    continue

        examples.append(MRQAExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible))

    return examples
def _example_to_features(example, example_id, tokenizer, max_seq_length,
                         doc_stride, max_query_length, id_offset, is_training):

    features = []
    query_tokens = tokenizer.tokenize(example.question_text)

    if len(query_tokens) > max_query_length:
        query_tokens = query_tokens[0:max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = None
    tok_end_position = None
    if is_training and example.is_impossible:
        tok_start_position = -1
        tok_end_position = -1
    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of the up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens = []
        token_to_orig_map = {}
        token_is_max_context = {}
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

            is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                   split_token_index)
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        #while len(input_ids) < max_seq_length:
        #    input_ids.append(0)
        #    input_mask.append(0)
        #    segment_ids.append(0)

        #assert len(input_ids) == max_seq_length
        #assert len(input_mask) == max_seq_length
        #assert len(segment_ids) == max_seq_length

        start_position = None
        end_position = None
        if is_training and not example.is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            out_of_span = False
            if not (tok_start_position >= doc_start and
                    tok_end_position <= doc_end):
                out_of_span = True
            if out_of_span:
                start_position = 0
                end_position = 0
                continue
            else:
                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        if is_training and example.is_impossible:
            start_position = 0
            end_position = 0

        def format_print():
            print("*** Example ***")
            print("unique_id: %s" % (id_offset))
            print("example_index: %s" % (example_id))
            print("doc_span_index: %s" % (doc_span_index))
            print("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            print("token_to_orig_map: %s" % " ".join(
                ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
            print("token_is_max_context: %s" % " ".join(
                ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            if is_training and example.is_impossible:
                print("impossible example")
            if is_training and not example.is_impossible:
                answer_text = " ".join(tokens[start_position:(end_position + 1)])
                print("start_position: %d" % (start_position))
                print("end_position: %d" % (end_position))
                print("answer: %s" % (tokenization.printable_text(answer_text)))

        if self._print_first_n > 0:
            format_print()
            self._print_first_n -= 1

        features.append(MRQAFeature(
            unique_id=id_offset,
            example_index=example_id,
            doc_span_index=doc_span_index,
            tokens=tokens,
            token_to_orig_map=token_to_orig_map,
            token_is_max_context=token_is_max_context,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            start_position=start_position,
            end_position=end_position,
            is_impossible=example.is_impossible))

        id_offset += 1

    return features
def _features_to_batches(features, batch_size, in_tokens):
    batch, total_token_num, max_len = [], 0, 0
    for (index, feature) in enumerate(features):
        seq_len = len(feature.input_ids)
        labels = [feature.unique_id] if feature.start_position is None else [
            feature.start_position, feature.end_position]
        example = [feature.input_ids, feature.segment_ids, range(seq_len)] + labels
        max_len = max(max_len, seq_len)
        if in_tokens:
            to_append = (len(batch) + 1) * max_len <= batch_size
        else:
            to_append = len(batch) < batch_size
        if to_append:
            batch.append(example)
            total_token_num += seq_len
        else:
            yield batch, total_token_num
            batch, total_token_num, max_len = [example], seq_len, seq_len

    if len(batch) > 0:
        yield batch, total_token_num
def _estimate_runtime_examples(data, sample_rate, tokenizer,
                               max_seq_length, doc_stride, max_query_length,
                               remove_impossible_questions=True,
                               filter_invalid_spans=True):
    """Count runtime examples which may differ from the number of raw samples
    due to the sliding window operation etc. This is useful to get correct
    warmup steps for training."""

    assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"

    num_raw_examples = 0
    for entry in data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                num_raw_examples += 1
    # print("num raw examples:{}".format(num_raw_examples))

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    sampled_examples = []
    first_samp = True
    for entry in data:
        for paragraph in entry["paragraphs"]:
            doc_tokens = None
            for qa in paragraph["qas"]:
                if not first_samp and random.random() > sample_rate and sample_rate < 1.0:
                    continue

                if doc_tokens is None:
                    paragraph_text = paragraph["context"]
                    doc_tokens = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in paragraph_text:
                        if is_whitespace(c):
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                doc_tokens.append(c)
                            else:
                                doc_tokens[-1] += c
                            prev_is_whitespace = False
                        char_to_word_offset.append(len(doc_tokens) - 1)

                assert len(qa["answers"]) == 1, "For training, each question should have exactly 1 answer."

                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                if ('is_impossible' in qa) and (qa["is_impossible"]):
                    if remove_impossible_questions or filter_invalid_spans:
                        continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                        is_impossible = True
                else:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]

                    # remove corrupt samples
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        continue

                example = MRQAExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                sampled_examples.append(example)
                first_samp = False

    runtime_sample_rate = len(sampled_examples) / float(num_raw_examples)

    runtime_samp_cnt = 0

    for example in sampled_examples:
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            if filter_invalid_spans and not (
                    tok_start_position >= doc_start and tok_end_position <= doc_end):
                continue
            runtime_samp_cnt += 1

    return int(runtime_samp_cnt / runtime_sample_rate)
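A quick numeric check of the estimator above, with illustrative numbers: suppose the dataset holds 10,000 raw questions, sampling keeps 200 of them, and those 200 expand into 230 valid sliding-window samples after filtering.

    num_raw_examples = 10000          # illustrative numbers
    sampled = 200
    runtime_samp_cnt = 230            # after sliding-window expansion/filtering
    runtime_sample_rate = sampled / float(num_raw_examples)   # 0.02
    print(int(runtime_samp_cnt / runtime_sample_rate))        # -> 11500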
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The MRQA annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in MRQA, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)
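To see the span search in action without a real WordPiece vocabulary, here is a toy run with a trivial whitespace "tokenizer" standing in for the real one (the class below is purely illustrative):

    class ToyTokenizer(object):
        """Stand-in whose tokenize() just lowercases and splits on spaces."""
        def tokenize(self, text):
            return text.lower().split()

    doc_tokens = ["(", "1895", "-", "1943", ")", "."]
    # The whitespace-level annotation covered the whole "(1895-1943)." word,
    # i.e. tokens 0..5; the true answer is just "1895".
    print(_improve_answer_span(doc_tokens, 0, 5, ToyTokenizer(), "1895"))
    # -> (1, 1)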
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index
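Replaying the docstring's example numerically (`DocSpan` is the same namedtuple the feature builder constructs):

    import collections

    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    # Doc: the(0) man(1) went(2) to(3) the(4) store(5) and(6) bought(7) a(8) gallon(9) of(10) milk(11)
    spans = [_DocSpan(0, 5),   # Span A: tokens 0-4
             _DocSpan(3, 5),   # Span B: tokens 3-7
             _DocSpan(6, 5)]   # Span C: tokens 6-10
    # 'bought' is token 7: span B gives min(4, 0) = 0, span C gives min(1, 3) = 1,
    # so span C wins.
    print(_check_is_max_context(spans, 1, 7))  # False (span B)
    print(_check_is_max_context(spans, 2, 7))  # True  (span C)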
build/lib/paddlepalm/reader/mrc4ernie.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.interface import reader
from paddlepalm.reader.utils.reader4ernie import MRCReader


class Reader(reader):

    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        self._is_training = phase == 'train'

        reader = MRCReader(config['vocab_path'],
                           max_seq_len=config['max_seq_len'],
                           do_lower_case=config.get('do_lower_case', False),
                           tokenizer='FullTokenizer',
                           for_cn=config.get('for_cn', False),
                           doc_stride=config['doc_stride'],
                           max_query_length=config['max_query_len'],
                           random_seed=config.get('seed', None))
        self._reader = reader
        self._dev_count = dev_count

        self._batch_size = config['batch_size']
        self._max_seq_len = config['max_seq_len']
        if phase == 'train':
            self._input_file = config['train_file']
            # self._num_epochs = config['num_epochs']
            self._num_epochs = None  # to keep the iterator from terminating
            self._shuffle = config.get('shuffle', False)
            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
        if phase == 'eval':
            self._input_file = config['dev_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)
        elif phase == 'pred':
            self._input_file = config['pred_file']
            self._num_epochs = 1
            self._shuffle = False
            self._batch_size = config.get('pred_batch_size', self._batch_size)

        self._phase = phase
        # self._batch_size = 
        self._print_first_n = config.get('print_first_n', 1)

        # TODO: without slide window version
        self._with_slide_window = config.get('with_slide_window', False)

    @property
    def outputs_attr(self):
        if self._is_training:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
                    "start_positions": [[-1, 1], 'int64'],
                    "end_positions": [[-1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64']
                    }
        else:
            return {"token_ids": [[-1, -1, 1], 'int64'],
                    "position_ids": [[-1, -1, 1], 'int64'],
                    "segment_ids": [[-1, -1, 1], 'int64'],
                    "task_ids": [[-1, -1, 1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
                    "unique_ids": [[-1, 1], 'int64']
                    }

    @property
    def epoch_outputs_attr(self):
        if not self._is_training:
            return {"examples": None,
                    "features": None}

    def load_data(self):
        self._data_generator = self._reader.data_generator(
            self._input_file, self._batch_size, self._num_epochs,
            dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)

    def iterator(self):

        def list_to_dict(x):
            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids',
                     'input_mask', 'start_positions', 'end_positions', 'unique_ids']
            outputs = {n: i for n, i in zip(names, x)}
            if self._is_training:
                del outputs['unique_ids']
            else:
                del outputs['start_positions']
                del outputs['end_positions']
            return outputs

        for batch in self._data_generator():
            yield list_to_dict(batch)

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)
build/lib/paddlepalm/reader/utils/__init__.py
deleted (100644 → 0)
build/lib/paddlepalm/reader/utils/batching4bert.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos;
    Note: mask_pos corresponds to batch_tokens after padding.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)

        # ensure at least mask one word in a sentence
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
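The thresholds above implement the usual BERT 15% masking budget: a uniform draw in [0, 1) selects a token with probability 0.15, and within that budget roughly 80% become [MASK] (0.03 < p <= 0.15), 10% get a random id (0.015 < p <= 0.03), and 10% stay unchanged but are still predicted (p <= 0.015). A quick empirical check of the sub-ranges:

    import numpy as np

    p = np.random.rand(1000000)
    print(((0.03 < p) & (p <= 0.15)).mean())   # ~0.12  -> [MASK]
    print(((0.015 < p) & (p <= 0.03)).mean())  # ~0.015 -> random id
    print((p <= 0.015).mean())                 # ~0.015 -> kept, still predicted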
def prepare_batch_data(insts,
                       total_token_num,
                       max_len=None,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False):
    """
    1. generate Tensor of data
    2. generate Tensor of position
    3. generate self attention mask, [shape: batch_size * max_len * max_len]
    """

    batch_src_ids = [inst[0] for inst in insts]
    batch_sent_ids = [inst[1] for inst in insts]
    batch_pos_ids = [inst[2] for inst in insts]
    labels_list = []
    # compatible with mrqa, whose example includes start/end positions,
    # or unique id
    for i in range(3, len(insts[0]), 1):
        labels = [inst[i] for inst in insts]
        labels = np.array(labels).astype("int64").reshape([-1, 1])
        labels_list.append(labels)

    # First step: do mask without padding
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            batch_src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = batch_src_ids
    # Second step: padding
    src_id, self_input_mask = pad_batch_data(
        out, max_len=max_len, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        batch_pos_ids, max_len=max_len, pad_idx=pad_id,
        return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        batch_sent_ids, max_len=max_len, pad_idx=pad_id,
        return_pos=False, return_input_mask=False)

    if mask_id >= 0:
        return_list = [
            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
        ] + labels_list
    else:
        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list

    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   max_len=None,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and input mask.
    """
    return_list = []
    if max_len is None:
        max_len = max(len(inst) for inst in insts)
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array([
        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
    ])
    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]

    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array(
            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list += [input_mask_data.astype("float32")]

    if return_max_len:
        return_list += [max_len]

    if return_num_token:
        num_token = 0
        for inst in insts:
            num_token += len(inst)
        return_list += [num_token]

    return return_list if len(return_list) > 1 else return_list[0]


if __name__ == "__main__":
    pass
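A quick usage example for `pad_batch_data` with toy token ids:

    ids, mask = pad_batch_data([[5, 6, 7], [8, 9]], pad_idx=0,
                               return_input_mask=True)
    print(ids.shape)            # (2, 3, 1): both instances padded to the batch max
    print(mask[1].squeeze(-1))  # [1. 1. 0.]: the padding position is masked out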
build/lib/paddlepalm/reader/utils/batching4ernie.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from six.moves import xrange


def mask(batch_tokens,
         seg_labels,
         mask_word_tags,
         total_token_num,
         vocab_size,
         CLS=1,
         SEP=2,
         MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos;
    Note: mask_pos corresponds to batch_tokens after padding.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        mask_word = mask_word_tags[sent_index]
        prob_index += pre_sent_len
        if mask_word:
            beg = 0
            for token_index, token in enumerate(sent):
                seg_label = seg_labels[sent_index][token_index]
                if seg_label == 1:
                    continue
                if beg == 0:
                    if seg_label != -1:
                        beg = token_index
                    continue

                prob = prob_mask[prob_index + beg]
                if prob > 0.15:
                    pass
                else:
                    for index in xrange(beg, token_index):
                        prob = prob_mask[prob_index + index]
                        base_prob = 1.0
                        if index == beg:
                            base_prob = 0.15
                        if base_prob * 0.2 < prob <= base_prob:
                            mask_label.append(sent[index])
                            sent[index] = MASK
                            mask_flag = True
                            mask_pos.append(sent_index * max_len + index)
                        elif base_prob * 0.1 < prob <= base_prob * 0.2:
                            mask_label.append(sent[index])
                            sent[index] = replace_ids[prob_index + index]
                            mask_flag = True
                            mask_pos.append(sent_index * max_len + index)
                        else:
                            mask_label.append(sent[index])
                            mask_pos.append(sent_index * max_len + index)

                if seg_label == -1:
                    beg = 0
                else:
                    beg = token_index
        else:
            for token_index, token in enumerate(sent):
                prob = prob_mask[prob_index + token_index]
                if prob > 0.15:
                    continue
                elif 0.03 < prob <= 0.15:
                    # mask
                    if token != SEP and token != CLS:
                        mask_label.append(sent[token_index])
                        sent[token_index] = MASK
                        mask_flag = True
                        mask_pos.append(sent_index * max_len + token_index)
                elif 0.015 < prob <= 0.03:
                    # random replace
                    if token != SEP and token != CLS:
                        mask_label.append(sent[token_index])
                        sent[token_index] = replace_ids[prob_index + token_index]
                        mask_flag = True
                        mask_pos.append(sent_index * max_len + token_index)
                else:
                    # keep the original token
                    if token != SEP and token != CLS:
                        mask_label.append(sent[token_index])
                        mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)

    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
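The word-level branch above decides once per word and then masks all of its subwords together: the first subword is gated at base_prob = 0.15 and the rest at 1.0, so a selected word is replaced as a unit. The seg_labels convention this implies (inferred from how the code reads the labels, so treat it as an assumption): -1 for special tokens such as [CLS]/[SEP], 0 for a word-initial subword, 1 for a continuation subword. A toy sentence under that convention:

    # Toy ids for: [CLS] hello wor ##ld [SEP]
    sent       = [101, 7592, 2088, 22997, 102]
    seg_labels = [-1,  0,    0,    1,     -1]   # "wor" + "##ld" form one word
    mask_word_tags = [True]                     # whole-word masking for this sentence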
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   return_seq_lens=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and attention bias.
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(
        [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]

    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array(
            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list += [input_mask_data.astype("float32")]

    if return_max_len:
        return_list += [max_len]

    if return_num_token:
        num_token = 0
        for inst in insts:
            num_token += len(inst)
        return_list += [num_token]

    if return_seq_lens:
        seq_lens = np.array([len(inst) for inst in insts])
        return_list += [seq_lens.astype("int64").reshape([-1, 1])]

    return return_list if len(return_list) > 1 else return_list[0]


if __name__ == "__main__":
    pass
build/lib/paddlepalm/reader/utils/mlm_batching.py
deleted (100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos;
    Note: mask_pos corresponds to batch_tokens after padding.
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)

        # ensure at least mask one word in a sentence
        while not mask_flag:
            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       max_len=None,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       task_id=0,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False):
    """
    1. generate Tensor of data
    2. generate Tensor of position
    3. generate self attention mask, [shape: batch_size * max_len * max_len]
    """
    batch_src_ids = [inst[0] for inst in insts]
    batch_sent_ids = [inst[1] for inst in insts]
    batch_pos_ids = [inst[2] for inst in insts]

    # First step: do mask without padding
    out, mask_label, mask_pos = mask(
        batch_src_ids,
        total_token_num,
        vocab_size=voc_size,
        CLS=cls_id,
        SEP=sep_id,
        MASK=mask_id)
    # Second step: padding
    src_id, self_input_mask = pad_batch_data(
        out, max_len=max_len, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        batch_pos_ids, max_len=max_len, pad_idx=pad_id,
        return_pos=False, return_input_mask=False)
    sent_id = pad_batch_data(
        batch_sent_ids, max_len=max_len, pad_idx=pad_id,
        return_pos=False, return_input_mask=False)
    task_ids = np.ones_like(src_id, dtype="int64") * task_id

    return_list = [
        src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
    ]

    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   max_len=None,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and input mask.
    """
    return_list = []
    if max_len is None:
        max_len = max(len(inst) for inst in insts)
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array([
        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
    ])
    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]

    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array(
            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list += [input_mask_data.astype("float32")]

    if return_max_len:
        return_list += [max_len]

    if return_num_token:
        num_token = 0
        for inst in insts:
            num_token += len(inst)
        return_list += [num_token]

    return return_list if len(return_list) > 1 else return_list[0]


if __name__ == "__main__":
    pass
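A minimal sketch of pad_batch_data on toy instances (pad id 0 is an assumption):

insts = [[5, 6, 7], [8, 9]]
padded, input_mask = pad_batch_data(insts, pad_idx=0, return_input_mask=True)
print(padded.shape)           # (2, 3, 1): batch padded to the longest instance
print(input_mask[1].ravel())  # [1. 1. 0.]: attention is masked on the pad slot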
build/lib/paddlepalm/reader/utils/mrqa_helper.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# __repr__ below relies on the tokenizer's printable_text helper
import paddlepalm.tokenizer.ernie_tokenizer as tokenization


class MRQAExample(object):
    """A single training/test example for simple sequence classification.
    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.start_position:
            s += ", end_position: %d" % (self.end_position)
        if self.start_position:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s


class MRQAFeature(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
build/lib/paddlepalm/reader/utils/reader4ernie.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
import os
import json
import random
import logging
import numpy as np
import six
from io import open
from collections import namedtuple

import paddlepalm.tokenizer.ernie_tokenizer as tokenization
from paddlepalm.reader.utils.batching4ernie import pad_batch_data
from paddlepalm.reader.utils.mlm_batching import prepare_batch_data


log = logging.getLogger(__name__)

if six.PY3:
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')


def csv_reader(fd, delimiter='\t'):
    def gen():
        for i in fd:
            slots = i.rstrip('\n').split(delimiter)
            if len(slots) == 1:
                yield slots,
            else:
                yield slots
    return gen()
class BaseReader(object):
    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 is_inference=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens
        self.is_inference = is_inference
        self.for_cn = for_cn
        self.task_id = task_id

        np.random.seed(random_seed)

        self.is_classify = is_classify
        self.is_regression = is_regression
        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        self.examples = {}

        if label_map_config:
            with open(label_map_config, encoding='utf8') as f:
                self.label_map = json.load(f)
        else:
            self.label_map = None

    def get_train_progress(self):
        """Gets progress for training phase."""
        return self.current_example, self.current_epoch

    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, 'r', encoding='utf8') as f:
            reader = csv_reader(f)
            headers = next(reader)
            Example = namedtuple('Example', headers)

            examples = []
            for line in reader:
                example = Example(*line)
                examples.append(example)
            return examples

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer
        # sequence one token at a time. This makes more sense than truncating
        # an equal percent of tokens from each, since if one sequence is very
        # short then each token that's truncated likely contains more
        # information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """Converts a single `Example` into a single `Record`."""
        text_a = tokenization.convert_to_unicode(example.text_a)
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = None

        has_text_b = False
        if isinstance(example, dict):
            has_text_b = "text_b" in example.keys()
        else:
            has_text_b = "text_b" in example._fields

        if has_text_b:
            text_b = tokenization.convert_to_unicode(example.text_b)
            tokens_b = tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT/ERNIE is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens = []
        text_type_ids = []
        tokens.append("[CLS]")
        text_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            text_type_ids.append(0)
        tokens.append("[SEP]")
        text_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                text_type_ids.append(1)
            tokens.append("[SEP]")
            text_type_ids.append(1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))

        if self.is_inference:
            Record = namedtuple('Record',
                                ['token_ids', 'text_type_ids', 'position_ids'])
            record = Record(
                token_ids=token_ids,
                text_type_ids=text_type_ids,
                position_ids=position_ids)
        else:
            if self.label_map:
                label_id = self.label_map[example.label]
            else:
                label_id = example.label

            Record = namedtuple('Record', [
                'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
            ])

            qid = None
            if "qid" in example._fields:
                qid = example.qid

            record = Record(
                token_ids=token_ids,
                text_type_ids=text_type_ids,
                position_ids=position_ids,
                label_id=label_id,
                qid=qid)
        return record

    def _prepare_batch_data(self, examples, batch_size, phase=None):
        """generate batch records"""
        batch_records, max_len = [], 0
        for index, example in enumerate(examples):
            if phase == "train":
                self.current_example = index
            record = self._convert_example_to_record(
                example, self.max_seq_len, self.tokenizer)
            max_len = max(max_len, len(record.token_ids))
            if self.in_tokens:
                to_append = (len(batch_records) + 1) * max_len <= batch_size
            else:
                to_append = len(batch_records) < batch_size
            if to_append:
                batch_records.append(record)
            else:
                yield self._pad_batch_records(batch_records)
                batch_records, max_len = [record], len(record.token_ids)

        if phase == 'pred' and batch_records:
            print('the last batch yielded.')
            yield self._pad_batch_records(batch_records)

    def get_num_examples(self, input_file=None, phase=None):
        if self.examples is not None:
            if phase is None:
                phase = 'all'
            return len(self.examples[phase])
        else:
            assert input_file is not None, \
                "Argument input_file should be given or the data_generator " \
                "should be created when this func is called."
            examples = self._read_tsv(input_file)
            return len(examples)

    def data_generator(self,
                       input_file,
                       batch_size,
                       epoch,
                       dev_count=1,
                       shuffle=True,
                       phase=None):
        examples = self._read_tsv(input_file)
        if phase is None:
            phase = 'all'
        self.examples[phase] = examples

        def wrapper():
            all_dev_batches = []
            if epoch is None:
                num_epochs = 99999999
            else:
                num_epochs = epoch
            for epoch_index in range(num_epochs):
                if phase == "train":
                    self.current_example = 0
                    self.current_epoch = epoch_index
                if shuffle:
                    np.random.shuffle(examples)

                for batch_data in self._prepare_batch_data(
                        examples, batch_size, phase=phase):
                    if len(all_dev_batches) < dev_count:
                        all_dev_batches.append(batch_data)
                    if len(all_dev_batches) == dev_count:
                        for batch in all_dev_batches:
                            yield batch
                        all_dev_batches = []

        def f():
            for i in wrapper():
                yield i

        # def f():
        #     try:
        #         for i in wrapper():
        #             yield i
        #     except Exception as e:
        #         import traceback
        #         traceback.print_exc()

        return f
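The in_tokens branch of _prepare_batch_data sizes batches by a token budget rather than a fixed example count: a record is admitted only while (len(batch) + 1) * max_len stays within batch_size. A standalone sketch of that rule with made-up lengths and a 16-token budget (unlike the reader, this sketch always yields the final partial batch):

def token_budget_batches(lengths, budget):
    """Group items so that (num_items * running_max_len) <= budget."""
    batch, max_len = [], 0
    for n in lengths:
        max_len = max(max_len, n)
        if (len(batch) + 1) * max_len <= budget:
            batch.append(n)
        else:
            yield batch
            batch, max_len = [n], n
    if batch:
        yield batch

print(list(token_budget_batches([3, 5, 4, 8, 2, 7], budget=16)))
# [[3, 5, 4], [8, 2], [7]] -> each batch, padded to its longest item,
# stays within the 16-token budget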
class MaskLMReader(BaseReader):

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """Converts a single `Example` into a single `Record`."""
        text_a = tokenization.convert_to_unicode(example.text_a)
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = None

        has_text_b = False
        if isinstance(example, dict):
            has_text_b = "text_b" in example.keys()
        else:
            has_text_b = "text_b" in example._fields

        if has_text_b:
            text_b = tokenization.convert_to_unicode(example.text_b)
            tokens_b = tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # token/type-id conventions are the same as documented in
        # BaseReader._convert_example_to_record above
        tokens = []
        text_type_ids = []
        tokens.append("[CLS]")
        text_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            text_type_ids.append(0)
        tokens.append("[SEP]")
        text_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                text_type_ids.append(1)
            tokens.append("[SEP]")
            text_type_ids.append(1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))

        Record = namedtuple('Record',
                            ['token_ids', 'text_type_ids', 'position_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids)

        return record

    def batch_reader(self, examples, batch_size, in_tokens, phase):
        batch, total_token_num, max_len = [], 0, 0
        for e in examples:
            parsed_line = self._convert_example_to_record(
                e, self.max_seq_len, self.tokenizer)
            token_ids, sent_ids, pos_ids = parsed_line
            max_len = max(max_len, len(token_ids))
            if in_tokens:
                to_append = (len(batch) + 1) * max_len <= batch_size
            else:
                to_append = len(batch) < batch_size
            if to_append:
                batch.append(parsed_line)
                total_token_num += len(token_ids)
            else:
                yield batch, total_token_num
                batch, total_token_num, max_len = [parsed_line], len(
                    token_ids), len(token_ids)

        if len(batch) > 0 and phase == 'pred':
            yield batch, total_token_num

    def data_generator(self,
                       input_file,
                       batch_size,
                       epoch,
                       dev_count=1,
                       shuffle=True,
                       phase=None):
        examples = self._read_tsv(input_file)
        if phase is None:
            phase = 'all'
        self.examples[phase] = examples

        def wrapper():
            all_dev_batches = []
            if epoch is None:
                num_epochs = 99999999
            else:
                num_epochs = epoch
            for epoch_index in range(num_epochs):
                if phase == "train":
                    self.current_example = 0
                    self.current_epoch = epoch_index
                if shuffle:
                    np.random.shuffle(examples)

                all_dev_batches = []
                for batch_data, total_token_num in self.batch_reader(
                        examples, batch_size, self.in_tokens, phase=phase):
                    # note: the vocab size and mask id are not set by
                    # BaseReader.__init__, so they are derived from the
                    # vocab here
                    batch_data = prepare_batch_data(
                        batch_data,
                        total_token_num,
                        voc_size=len(self.vocab),
                        pad_id=self.pad_id,
                        cls_id=self.cls_id,
                        sep_id=self.sep_id,
                        mask_id=self.vocab["[MASK]"],
                        max_len=self.max_seq_len,
                        return_input_mask=True,
                        return_max_len=False,
                        return_num_token=False)

                    if len(all_dev_batches) < dev_count:
                        all_dev_batches.append(batch_data)
                    if len(all_dev_batches) == dev_count:
                        for batch in all_dev_batches:
                            yield batch
                        all_dev_batches = []

        return wrapper
class ClassifyReader(BaseReader):
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, 'r', encoding='utf8') as f:
            reader = csv_reader(f)
            headers = next(reader)
            text_indices = [
                index for index, h in enumerate(headers) if h != "label"
            ]
            Example = namedtuple('Example', headers)

            examples = []
            for line in reader:
                for index, text in enumerate(line):
                    if index in text_indices:
                        if self.for_cn:
                            line[index] = text.replace(' ', '')
                        else:
                            line[index] = text
                example = Example(*line)
                examples.append(example)
            return examples

    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]

        if not self.is_inference:
            batch_labels = [record.label_id for record in batch_records]
            if self.is_classify:
                batch_labels = np.array(batch_labels).astype("int64").reshape(
                    [-1, 1])
            elif self.is_regression:
                batch_labels = np.array(batch_labels).astype(
                    "float32").reshape([-1, 1])

            if batch_records[0].qid:
                batch_qids = [record.qid for record in batch_records]
                batch_qids = np.array(batch_qids).astype("int64").reshape(
                    [-1, 1])
            else:
                batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask
        ]
        if not self.is_inference:
            return_list += [batch_labels, batch_qids]

        return return_list
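A minimal sketch of how a ClassifyReader might be consumed (the vocab path and TSV file are hypothetical; the TSV needs a header line such as text_a<TAB>label):

reader = ClassifyReader(
    vocab_path='pretrain/ernie/vocab.txt',  # hypothetical path
    max_seq_len=128,
    do_lower_case=True)

gen = reader.data_generator(
    'data/train.tsv',  # hypothetical TSV with a "text_a\tlabel" header
    batch_size=32,
    epoch=2,
    shuffle=True,
    phase='train')

for batch in gen():
    token_ids, type_ids, pos_ids, task_ids, input_mask, labels, qids = batch
    # token_ids: int64 [batch, max_len_in_batch, 1]; input_mask: float32
    break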
class SequenceLabelReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_label_ids = pad_batch_data(
            batch_label_ids, pad_idx=len(self.label_map) - 1)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens
        ]
        return return_list

    def _reseg_token_label(self, tokens, labels, tokenizer):
        assert len(tokens) == len(labels)
        ret_tokens = []
        ret_labels = []
        for token, label in zip(tokens, labels):
            sub_token = tokenizer.tokenize(token)
            if len(sub_token) == 0:
                continue
            ret_tokens.extend(sub_token)
            if len(sub_token) == 1:
                ret_labels.append(label)
                continue

            if label == "O" or label.startswith("I-"):
                ret_labels.extend([label] * len(sub_token))
            elif label.startswith("B-"):
                i_label = "I-" + label[2:]
                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
            elif label.startswith("S-"):
                b_label = "B-" + label[2:]
                e_label = "E-" + label[2:]
                i_label = "I-" + label[2:]
                ret_labels.extend([b_label] + [i_label] *
                                  (len(sub_token) - 2) + [e_label])
            elif label.startswith("E-"):
                i_label = "I-" + label[2:]
                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])

        assert len(ret_tokens) == len(ret_labels)
        return ret_tokens, ret_labels

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        # text_a and label fields are separated by the \2 control character
        # in ERNIE-format sequence label data (the character is invisible
        # in rendered listings)
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)

        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            label_ids=label_ids)
        return record
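The subword re-segmentation above can be exercised in isolation; a sketch with a stub tokenizer (the WordPiece split of 'Washington' is made up):

class StubTokenizer(object):
    """Pretends 'Washington' splits into three WordPiece subwords."""
    def tokenize(self, token):
        return {'Washington': ['Wash', '##ing', '##ton']}.get(token, [token])

reader_like = SequenceLabelReader.__new__(SequenceLabelReader)  # skip __init__
tokens, labels = reader_like._reseg_token_label(
    ['Washington', 'is', 'nice'], ['B-LOC', 'O', 'O'], StubTokenizer())
print(tokens)  # ['Wash', '##ing', '##ton', 'is', 'nice']
print(labels)  # ['B-LOC', 'I-LOC', 'I-LOC', 'O', 'O']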
class ExtractEmbeddingReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, seq_lens
        ]
        return return_list
class MRCReader(BaseReader):
    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0,
                 doc_stride=128,
                 max_query_length=64):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens
        self.for_cn = for_cn
        self.task_id = task_id
        self.doc_stride = doc_stride
        self.max_query_length = max_query_length
        self.examples = {}
        self.features = {}

        if random_seed is not None:
            np.random.seed(random_seed)

        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        self.Example = namedtuple('Example', [
            'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
            'start_position', 'end_position'
        ])
        self.Feature = namedtuple("Feature", [
            "unique_id", "example_index", "doc_span_index", "tokens",
            "token_to_orig_map", "token_is_max_context", "token_ids",
            "position_ids", "text_type_ids", "start_position", "end_position"
        ])
        self.DocSpan = namedtuple("DocSpan", ["start", "length"])

    def _read_json(self, input_file, is_training):
        examples = []
        with open(input_file, "r", encoding='utf8') as f:
            input_data = json.load(f)["data"]
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_pos = None
                    end_pos = None
                    orig_answer_text = None

                    if is_training:
                        if len(qa["answers"]) != 1:
                            raise ValueError(
                                "For training, each question should have "
                                "exactly 1 answer.")

                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        # split the context into [before, answer, after], so
                        # the answer span is always doc_tokens[1]
                        doc_tokens = [
                            paragraph_text[:answer_offset],
                            paragraph_text[answer_offset:answer_offset +
                                           answer_length],
                            paragraph_text[answer_offset + answer_length:]
                        ]

                        start_pos = 1
                        end_pos = 1

                        actual_text = " ".join(
                            doc_tokens[start_pos:(end_pos + 1)])
                        if actual_text.find(orig_answer_text) == -1:
                            log.info("Could not find answer: '%s' vs. '%s'",
                                     actual_text, orig_answer_text)
                            continue
                    else:
                        doc_tokens = tokenization.tokenize_chinese_chars(
                            paragraph_text)

                    example = self.Example(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_pos,
                        end_position=end_pos)
                    examples.append(example)
        return examples

    def _improve_answer_span(self, doc_tokens, input_start, input_end,
                             tokenizer, orig_answer_text):
        # search for the smallest sub-span whose re-tokenization exactly
        # matches the tokenized answer text
        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

        for new_start in range(input_start, input_end + 1):
            for new_end in range(input_end, new_start - 1, -1):
                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
                if text_span == tok_answer_text:
                    return (new_start, new_end)

        return (input_start, input_end)

    def _check_is_max_context(self, doc_spans, cur_span_index, position):
        # a token can appear in several overlapping doc spans; the span where
        # it has the most surrounding context (plus a small bonus for longer
        # spans) is treated as its canonical one
        best_score = None
        best_span_index = None
        for (span_index, doc_span) in enumerate(doc_spans):
            end = doc_span.start + doc_span.length - 1
            if position < doc_span.start:
                continue
            if position > end:
                continue
            num_left_context = position - doc_span.start
            num_right_context = end - position
            score = min(num_left_context,
                        num_right_context) + 0.01 * doc_span.length
            if best_score is None or score > best_score:
                best_score = score
                best_span_index = span_index

        return cur_span_index == best_span_index

    def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
                                    is_training):
        features = []
        unique_id = 1000000000

        for (example_index, example) in enumerate(examples):
            query_tokens = tokenizer.tokenize(example.question_text)
            if len(query_tokens) > self.max_query_length:
                query_tokens = query_tokens[0:self.max_query_length]
            tok_to_orig_index = []
            orig_to_tok_index = []
            all_doc_tokens = []
            for (i, token) in enumerate(example.doc_tokens):
                orig_to_tok_index.append(len(all_doc_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    all_doc_tokens.append(sub_token)

            tok_start_position = None
            tok_end_position = None
            if is_training:
                tok_start_position = orig_to_tok_index[example.start_position]
                if example.end_position < len(example.doc_tokens) - 1:
                    tok_end_position = orig_to_tok_index[
                        example.end_position + 1] - 1
                else:
                    tok_end_position = len(all_doc_tokens) - 1
                (tok_start_position,
                 tok_end_position) = self._improve_answer_span(
                     all_doc_tokens, tok_start_position, tok_end_position,
                     tokenizer, example.orig_answer_text)

            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
            doc_spans = []
            start_offset = 0
            while start_offset < len(all_doc_tokens):
                length = len(all_doc_tokens) - start_offset
                if length > max_tokens_for_doc:
                    length = max_tokens_for_doc
                doc_spans.append(
                    self.DocSpan(start=start_offset, length=length))
                if start_offset + length == len(all_doc_tokens):
                    break
                start_offset += min(length, self.doc_stride)

            for (doc_span_index, doc_span) in enumerate(doc_spans):
                tokens = []
                token_to_orig_map = {}
                token_is_max_context = {}
                text_type_ids = []
                tokens.append("[CLS]")
                text_type_ids.append(0)
                for token in query_tokens:
                    tokens.append(token)
                    text_type_ids.append(0)
                tokens.append("[SEP]")
                text_type_ids.append(0)

                for i in range(doc_span.length):
                    split_token_index = doc_span.start + i
                    token_to_orig_map[len(tokens)] = tok_to_orig_index[
                        split_token_index]

                    is_max_context = self._check_is_max_context(
                        doc_spans, doc_span_index, split_token_index)
                    token_is_max_context[len(tokens)] = is_max_context
                    tokens.append(all_doc_tokens[split_token_index])
                    text_type_ids.append(1)
                tokens.append("[SEP]")
                text_type_ids.append(1)

                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                position_ids = list(range(len(token_ids)))
                start_position = None
                end_position = None
                if is_training:
                    doc_start = doc_span.start
                    doc_end = doc_span.start + doc_span.length - 1
                    out_of_span = False
                    if not (tok_start_position >= doc_start and
                            tok_end_position <= doc_end):
                        out_of_span = True
                    if out_of_span:
                        start_position = 0
                        end_position = 0
                    else:
                        # positions are re-based onto the [CLS] query [SEP]
                        # prefix of this span
                        doc_offset = len(query_tokens) + 2
                        start_position = tok_start_position - doc_start + doc_offset
                        end_position = tok_end_position - doc_start + doc_offset

                feature = self.Feature(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    token_ids=token_ids,
                    position_ids=position_ids,
                    text_type_ids=text_type_ids,
                    start_position=start_position,
                    end_position=end_position)
                features.append(feature)

                unique_id += 1

        return features

    def _prepare_batch_data(self, records, batch_size, phase=None):
        """generate batch records"""
        batch_records, max_len = [], 0

        for index, record in enumerate(records):
            if phase == "train":
                self.current_example = index
            max_len = max(max_len, len(record.token_ids))
            if self.in_tokens:
                to_append = (len(batch_records) + 1) * max_len <= batch_size
            else:
                to_append = len(batch_records) < batch_size
            if to_append:
                batch_records.append(record)
            else:
                yield self._pad_batch_records(batch_records, phase == "train")
                batch_records, max_len = [record], len(record.token_ids)

        if phase == 'pred' and batch_records:
            yield self._pad_batch_records(batch_records, phase == "train")

    def _pad_batch_records(self, batch_records, is_training):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [
            record.text_type_ids for record in batch_records
        ]
        batch_position_ids = [record.position_ids for record in batch_records]
        if is_training:
            batch_start_position = [
                record.start_position for record in batch_records
            ]
            batch_end_position = [
                record.end_position for record in batch_records
            ]
            batch_start_position = np.array(batch_start_position).astype(
                "int64").reshape([-1, 1])
            batch_end_position = np.array(batch_end_position).astype(
                "int64").reshape([-1, 1])
        else:
            batch_size = len(batch_token_ids)
            batch_start_position = np.zeros(
                shape=[batch_size, 1], dtype="int64")
            batch_end_position = np.zeros(
                shape=[batch_size, 1], dtype="int64")

        batch_unique_ids = [record.unique_id for record in batch_records]
        batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape(
            [-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, pad_idx=self.pad_id)
        padded_task_ids = np.ones_like(
            padded_token_ids, dtype="int64") * self.task_id

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            padded_task_ids, input_mask, batch_start_position,
            batch_end_position, batch_unique_ids
        ]

        return return_list

    def get_num_examples(self, phase):
        return len(self.features[phase])

    def get_features(self, phase):
        return self.features[phase]

    def get_examples(self, phase):
        return self.examples[phase]

    def data_generator(self,
                       input_file,
                       batch_size,
                       epoch,
                       dev_count=1,
                       shuffle=True,
                       phase=None):
        examples = self.examples.get(phase, None)
        features = self.features.get(phase, None)
        if not examples:
            examples = self._read_json(input_file, phase == "train")
            features = self._convert_example_to_feature(
                examples, self.max_seq_len, self.tokenizer, phase == "train")
            self.examples[phase] = examples
            self.features[phase] = features

        def wrapper():
            all_dev_batches = []
            if epoch is None:
                num_epochs = 99999999
            else:
                num_epochs = epoch
            for epoch_index in range(num_epochs):
                if phase == "train":
                    self.current_example = 0
                    self.current_epoch = epoch_index
                if phase == "train" and shuffle:
                    np.random.shuffle(features)

                for batch_data in self._prepare_batch_data(
                        features, batch_size, phase=phase):
                    if len(all_dev_batches) < dev_count:
                        all_dev_batches.append(batch_data)
                    if len(all_dev_batches) == dev_count:
                        for batch in all_dev_batches:
                            yield batch
                        all_dev_batches = []

        return wrapper


if __name__ == '__main__':
    pass
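The doc_stride sliding window in _convert_example_to_feature carves a long passage into overlapping spans; a standalone sketch with toy numbers (150 doc tokens, a 100-token budget and stride 64 are assumptions):

from collections import namedtuple

DocSpan = namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return spans

print(make_doc_spans(150, 100, 64))
# [DocSpan(start=0, length=100), DocSpan(start=64, length=86)]
# tokens 64-99 appear in both spans; _check_is_max_context picks, for each
# token, the span where it sits with the most context on both sides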
build/lib/paddlepalm/task_instance.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.interface import reader as base_reader
from paddlepalm.interface import task_paradigm as base_paradigm
import os
import json
from paddle import fluid


class TaskInstance(object):

    def __init__(self, name, id, config={}, verbose=True):
        self._name = name
        self._config = config
        self._verbose = verbose

        self._save_infermodel_path = os.path.join(self._config['save_path'],
                                                  'infer_model')
        self._save_ckpt_path = os.path.join(self._config['save_path'], 'ckpt')

        # following flags can be fetched from the instance config file
        self._is_target = config.get('is_target', True)
        self._is_first_target = config.get('is_first_target', False)
        self._task_reuse_scope = config.get('task_reuse_scope', name)

        self._feeded_var_names = None
        self._target_vars = None

        # training process management
        self._mix_ratio = None
        self._expected_train_steps = None
        self._expected_train_epochs = None
        self._steps_pur_epoch = None
        self._cur_train_epoch = 0
        self._cur_train_step = 0
        self._train_finish = False

        # dataset readers for each running phase (train/eval/pred);
        # the key is the phase name, the value is a Reader instance
        self._reader = {'train': None, 'eval': None, 'pred': None}
        self._input_layer = None
        self._inputname_to_varname = {}
        self._task_layer = {'train': None, 'eval': None, 'pred': None}
        self._pred_input_name_list = []
        self._pred_input_varname_list = []
        self._pred_fetch_name_list = []
        self._pred_fetch_var_list = []

        self._Reader = None
        self._Paradigm = None

        self._exe = fluid.Executor(fluid.CPUPlace())

        self._save_protocol = {
            'input_names': 'self._pred_input_name_list',
            'input_varnames': 'self._pred_input_varname_list',
            'fetch_list': 'self._pred_fetch_name_list'
        }

    def build_task_layer(self, net_inputs, phase):
        output_vars = self._task_layer[phase].build(net_inputs)
        if phase == 'pred':
            self._pred_fetch_name_list, self._pred_fetch_var_list = zip(
                *output_vars.items())
        return output_vars

    def postprocess(self, rt_outputs, phase):
        return self._task_layer[phase].postprocess(rt_outputs)

    def epoch_postprocess(self, epoch_inputs, phase):
        return self._task_layer[phase].epoch_postprocess(epoch_inputs)

    def save(self, suffix=''):
        dirpath = self._save_infermodel_path + suffix
        self._pred_input_varname_list = [
            str(i) for i in self._pred_input_varname_list
        ]

        fluid.io.save_inference_model(dirpath, self._pred_input_varname_list,
                                      self._pred_fetch_var_list, self._exe)
        # fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe, params_filename='__params__')
        print(self._name + ': inference model saved at ' + dirpath)

        conf = {}
        for k, strv in self._save_protocol.items():
            exec('v={}'.format(strv))
            conf[k] = v
        with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
            writer.write(json.dumps(conf, indent=1))

    def load(self, infer_model_path=None):
        if infer_model_path is None:
            infer_model_path = self._save_infermodel_path
        for k, v in json.load(
                open(os.path.join(infer_model_path, '__conf__'))).items():
            strv = self._save_protocol[k]
            exec('{}=v'.format(strv))
        pred_prog, self._pred_input_varname_list, self._pred_fetch_var_list = \
            fluid.io.load_inference_model(infer_model_path, self._exe)
        # pred_prog, self._pred_input_varname_list, self._pred_fetch_var_list = \
        #     fluid.io.load_inference_model(infer_model_path, self._exe, params_filename='__params__')
        print(self._name + ': inference model loaded from ' +
              infer_model_path)
        return pred_prog

    @property
    def name(self):
        return self._name

    @property
    def Reader(self):
        return self._Reader

    @Reader.setter
    def Reader(self, cls):
        assert base_reader.__name__ == cls.__bases__[-1].__name__, \
            "expect: {}, receive: {}.".format(base_reader.__name__,
                                              cls.__bases__[-1].__name__)
        self._Reader = cls

    @property
    def Paradigm(self):
        return self._Paradigm

    @Paradigm.setter
    def Paradigm(self, cls):
        assert base_paradigm.__name__ == cls.__bases__[-1].__name__, \
            "expect: {}, receive: {}.".format(base_paradigm.__name__,
                                              cls.__bases__[-1].__name__)
        self._Paradigm = cls

    @property
    def config(self):
        return self._config

    @property
    def reader(self):
        return self._reader

    @property
    def pred_input(self):
        return zip(*[self._pred_input_name_list,
                     self._pred_input_varname_list])

    @pred_input.setter
    def pred_input(self, val):
        assert isinstance(val, dict)
        self._pred_input_name_list, self._pred_input_varname_list = \
            zip(*[[k, v.name] for k, v in val.items()])
        # print(self._pred_input_name_list)

    @property
    def pred_fetch_list(self):
        return [self._pred_fetch_name_list, self._pred_fetch_var_list]

    @property
    def task_layer(self):
        return self._task_layer

    @property
    def is_first_target(self):
        return self._is_first_target

    @is_first_target.setter
    def is_first_target(self, value):
        self._is_first_target = bool(value)
        if self._is_first_target:
            assert self._is_target, "ERROR: only target task could be set as main task."
        if self._verbose and self._is_first_target:
            print("{}: set as main task".format(self._name))

    @property
    def is_target(self):
        if self._is_target is not None:
            return self._is_target
        else:
            raise ValueError("{}: is_target is None".format(self._name))

    @is_target.setter
    def is_target(self, value):
        self._is_target = bool(value)
        if self._verbose:
            if self._is_target:
                print('{}: set as target task.'.format(self._name))
            else:
                print('{}: set as aux task.'.format(self._name))

    @property
    def mix_ratio(self):
        if self._mix_ratio is not None:
            return self._mix_ratio
        else:
            raise ValueError("{}: mix_ratio is None".format(self._name))

    @mix_ratio.setter
    def mix_ratio(self, value):
        self._mix_ratio = float(value)
        if self._verbose:
            print('{}: mix_ratio is set to {}'.format(self._name,
                                                      self._mix_ratio))

    @property
    def expected_train_steps(self):
        return self._expected_train_steps

    @expected_train_steps.setter
    def expected_train_steps(self, value):
        self._expected_train_steps = value
        self._expected_train_epochs = value / float(self._steps_pur_epoch)

    @property
    def expected_train_epochs(self):
        return self._expected_train_epochs

    @property
    def cur_train_epoch(self):
        return self._cur_train_epoch

    @cur_train_epoch.setter
    def cur_train_epoch(self, value):
        self._cur_train_epoch = value

    @property
    def cur_train_step(self):
        return self._cur_train_step

    @cur_train_step.setter
    def cur_train_step(self, value):
        self._cur_train_step = value
        if self._cur_train_step > self._steps_pur_epoch:
            self._cur_train_epoch += 1
            self._cur_train_step = 1
        if self._is_target and self._cur_train_step + \
                self._cur_train_epoch * self._steps_pur_epoch >= \
                self._expected_train_steps:
            self._train_finish = True
            print(self._name + ': train finished!')
            self.save()
            # fluid.io.save_inference_model(self._save_infermodel_path, )

    @property
    def steps_pur_epoch(self):
        return self._steps_pur_epoch

    @steps_pur_epoch.setter
    def steps_pur_epoch(self, value):
        self._steps_pur_epoch = value

    @property
    def train_finish(self):
        return self._train_finish

    @property
    def task_reuse_scope(self):
        if self._task_reuse_scope is not None:
            return self._task_reuse_scope
        else:
            raise ValueError(
                "{}: task_reuse_scope is None".format(self._name))

    @task_reuse_scope.setter
    def task_reuse_scope(self, scope_name):
        self._task_reuse_scope = str(scope_name)
        if self._verbose:
            print('{}: task_reuse_scope is set to {}'.format(
                self._name, self._task_reuse_scope))


def check_instances(insts):
    """to check ids, first_target"""
    pass


def _check_ids():
    pass


def _check_targets():
    pass


def _check_reuse_scopes():
    pass
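The step/epoch bookkeeping in the cur_train_step setter can be traced without building a TaskInstance; a sketch with made-up numbers (3 steps per epoch, 7 expected steps):

steps_per_epoch, expected_steps = 3, 7
cur_step, cur_epoch, finished = 0, 0, False
for _ in range(10):
    cur_step += 1
    if cur_step > steps_per_epoch:  # roll over into the next epoch
        cur_epoch += 1
        cur_step = 1
    if cur_step + cur_epoch * steps_per_epoch >= expected_steps:
        finished = True             # the instance would save() here
        break
print(cur_epoch, cur_step, finished)  # 2 1 True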
build/lib/paddlepalm/task_paradigm/__init__.py
deleted 100644 → 0
view file @ e2368644
build/lib/paddlepalm/task_paradigm/cls.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddlepalm.interface import task_paradigm
from paddle.fluid import layers


class TaskParadigm(task_paradigm):
    '''
    classification
    '''

    def __init__(self, config, phase):
        self._is_training = phase == 'train'
        self.sent_emb_size = config['hidden_size']
        self.num_classes = config['n_classes']

    @property
    def inputs_attrs(self):
        return {
            'backbone': {
                "sentence_emb": [[-1, self.sent_emb_size], 'float32']
            },
            'reader': {
                "label_ids": [[-1, 1], 'int64']
            }
        }

    @property
    def outputs_attrs(self):
        if self._is_training:
            return {'loss': [[1], 'float32']}
        else:
            return {'logits': [[-1, self.num_classes], 'float32']}

    def build(self, inputs):
        sent_emb = inputs['backbone']['sentence_emb']
        label_ids = inputs['reader']['label_ids']

        logits = fluid.layers.fc(
            input=sent_emb,
            size=self.num_classes,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.1)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b",
                initializer=fluid.initializer.Constant(0.)))

        loss = fluid.layers.softmax_with_cross_entropy(
            logits=logits, label=label_ids)
        loss = layers.mean(loss)
        if self._is_training:
            return {"loss": loss}
        else:
            return {"logits": logits}
build/lib/paddlepalm/task_paradigm/match.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddlepalm.interface import task_paradigm
from paddle.fluid import layers


class TaskParadigm(task_paradigm):
    '''
    matching
    '''

    def __init__(self, config, phase, backbone_config=None):
        self._is_training = phase == 'train'
        self._hidden_size = backbone_config['hidden_size']

    @property
    def inputs_attrs(self):
        if self._is_training:
            reader = {"label_ids": [[-1, 1], 'int64']}
        else:
            reader = {}
        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
        return {'reader': reader, 'backbone': bb}

    @property
    def outputs_attrs(self):
        if self._is_training:
            return {"loss": [[1], 'float32']}
        else:
            return {"logits": [[-1, 1], 'float32']}

    def build(self, inputs):
        if self._is_training:
            labels = inputs["reader"]["label_ids"]
        cls_feats = inputs["backbone"]["sentence_pair_embedding"]

        cls_feats = fluid.layers.dropout(
            x=cls_feats,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = fluid.layers.fc(
            input=cls_feats,
            size=2,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b",
                initializer=fluid.initializer.Constant(0.)))

        if self._is_training:
            ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
                logits=logits, label=labels, return_softmax=True)
            loss = fluid.layers.mean(x=ce_loss)
            return {'loss': loss}
        else:
            return {'logits': logits}
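A minimal sketch of the paradigm's build() contract (the 768 hidden size and variable names are assumptions; requires PaddlePaddle 1.x):

import paddle.fluid as fluid
from paddlepalm.task_paradigm.match import TaskParadigm

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    label_ids = fluid.layers.data(name='label_ids', shape=[1], dtype='int64')
    pair_emb = fluid.layers.data(
        name='sentence_pair_embedding', shape=[768], dtype='float32')
    paradigm = TaskParadigm(
        config={}, phase='train', backbone_config={'hidden_size': 768})
    outputs = paradigm.build({'reader': {'label_ids': label_ids},
                              'backbone': {'sentence_pair_embedding': pair_emb}})
    # outputs == {'loss': <mean cross-entropy variable>}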
build/lib/paddlepalm/task_paradigm/mlm.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddlepalm.interface import task_paradigm
from paddle.fluid import layers
# pre_process_layer (the layer-norm helper used below) lives in the
# backbone utils
from paddlepalm.backbone.utils.transformer import pre_process_layer


class TaskParadigm(task_paradigm):
    '''
    masked language model
    '''

    def __init__(self, config, phase, backbone_config=None):
        self._is_training = phase == 'train'
        self._hidden_size = backbone_config['hidden_size']
        self._vocab_size = backbone_config['vocab_size']
        self._hidden_act = backbone_config['hidden_act']
        self._initializer_range = backbone_config['initializer_range']

    @property
    def inputs_attrs(self):
        # build() consumes mask_label/mask_pos from the reader and both the
        # word embedding matrix and encoder outputs from the backbone
        if self._is_training:
            reader = {
                "mask_label": [[-1, 1], 'int64'],
                "mask_pos": [[-1, 1], 'int64']
            }
        else:
            reader = {}
        bb = {
            "word_embedding": [[self._vocab_size, self._hidden_size],
                               'float32'],
            "encoder_outputs": [[-1, self._hidden_size], 'float32']
        }
        return {'reader': reader, 'backbone': bb}

    @property
    def outputs_attrs(self):
        if self._is_training:
            return {"loss": [[1], 'float32']}
        else:
            return {"logits": [[-1, 1], 'float32']}

    def build(self, inputs):
        mask_label = inputs["reader"]["mask_label"]
        mask_pos = inputs["reader"]["mask_pos"]
        word_emb = inputs["backbone"]["word_embedding"]
        enc_out = inputs["backbone"]["encoder_outputs"]

        emb_size = word_emb.shape[-1]

        _param_initializer = fluid.initializer.TruncatedNormal(
            scale=self._initializer_range)

        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        reshaped_emb_out = fluid.layers.reshape(
            x=enc_out, shape=[-1, emb_size])

        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
        num_seqs = fluid.layers.fill_constant(
            shape=[1], value=512, dtype='int64')

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
            input=mask_feat,
            size=emb_size,
            act=self._hidden_act,
            param_attr=fluid.ParamAttr(
                name='mask_lm_trans_fc.w_0',
                initializer=_param_initializer),
            bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(
            mask_trans_feat, 'n', name='mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(
            name="mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))

        # print fluid.default_main_program().global_block()
        # fc_out = fluid.layers.matmul(
        #     x=mask_trans_feat,
        #     y=fluid.default_main_program().global_block().var(
        #         _word_emb_name),
        #     transpose_y=True)
        # tie the output projection to the input word embedding
        fc_out = fluid.layers.matmul(
            x=mask_trans_feat, y=word_emb, transpose_y=True)
        fc_out += fluid.layers.create_parameter(
            shape=[self._vocab_size],
            dtype='float32',
            attr=mask_lm_out_bias_attr,
            is_bias=True)
        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=mask_label)
        loss = fluid.layers.mean(mask_lm_loss)

        if self._is_training:
            return {'loss': loss}
        else:
            return None
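This gather is why mask_pos was computed as sent_index * max_len + token_index back in mlm_batching.py: once the encoder output is reshaped to [batch * max_len, hidden], those flat offsets select exactly the masked rows. A numpy sketch of the same indexing with toy sizes:

import numpy as np

batch, max_len, hidden = 2, 4, 3
enc_out = np.arange(batch * max_len * hidden).reshape(batch, max_len, hidden)

# say tokens (0, 2) and (1, 1) were masked
mask_pos = np.array([0 * max_len + 2, 1 * max_len + 1])

flat = enc_out.reshape(-1, hidden)  # [batch * max_len, hidden]
mask_feat = flat[mask_pos]          # rows for the masked tokens
assert (mask_feat[0] == enc_out[0, 2]).all()
assert (mask_feat[1] == enc_out[1, 1]).all()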
build/lib/paddlepalm/task_paradigm/mrc.py
deleted 100644 → 0
view file @ e2368644
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.fluid
as
fluid
from
paddlepalm.interface
import
task_paradigm
import
collections
import
numpy
as
np
import
os
import
math
import
six
import
paddlepalm.tokenizer.ernie_tokenizer
as
tokenization
import
json
RawResult
=
collections
.
namedtuple
(
"RawResult"
,
[
"unique_id"
,
"start_logits"
,
"end_logits"
])
class
TaskParadigm
(
task_paradigm
):
""""""
def
__init__
(
self
,
config
,
phase
,
backbone_config
=
None
):
self
.
_is_training
=
phase
==
'train'
self
.
_max_sequence_length
=
config
[
'max_seq_len'
]
self
.
_hidden_size
=
backbone_config
[
'hidden_size'
]
self
.
_pred_results
=
[]
if
phase
==
'pred'
:
self
.
_max_answer_length
=
config
.
get
(
'max_answer_len'
,
None
)
self
.
_null_score_diff_threshold
=
config
.
get
(
'null_score_diff_threshold'
,
0.0
)
self
.
_n_best_size
=
config
.
get
(
'n_best_size'
,
20
)
self
.
_pred_output_path
=
config
.
get
(
'pred_output_path'
,
None
)
self
.
_verbose
=
config
.
get
(
'verbose'
,
False
)
self
.
_with_negative
=
config
.
get
(
'with_negative'
,
False
)
self
.
_do_lower_case
=
config
.
get
(
'do_lower_case'
,
False
)
@
property
def
inputs_attrs
(
self
):
if
self
.
_is_training
:
reader
=
{
"start_positions"
:
[[
-
1
,
1
],
'int64'
],
"end_positions"
:
[[
-
1
,
1
],
'int64'
]}
else
:
reader
=
{
'unique_ids'
:
[[
-
1
,
1
],
'int64'
]}
bb
=
{
"encoder_outputs"
:
[[
-
1
,
-
1
,
self
.
_hidden_size
],
'float32'
]}
return
{
'reader'
:
reader
,
'backbone'
:
bb
}
@
property
def
epoch_inputs_attrs
(
self
):
if
not
self
.
_is_training
:
from_reader
=
{
'examples'
:
None
,
'features'
:
None
}
return
{
'reader'
:
from_reader
}
@
property
def
outputs_attr
(
self
):
if
self
.
_is_training
:
return
{
'loss'
:
[[
1
],
'float32'
]}
else
:
return
{
'start_logits'
:
[[
-
1
,
-
1
,
1
],
'float32'
],
'end_logits'
:
[[
-
1
,
-
1
,
1
],
'float32'
],
'unique_ids'
:
[[
-
1
,
1
],
'int64'
]}
def
build
(
self
,
inputs
):
if
self
.
_is_training
:
start_positions
=
inputs
[
'reader'
][
'start_positions'
]
end_positions
=
inputs
[
'reader'
][
'end_positions'
]
else
:
unique_id
=
inputs
[
'reader'
][
'unique_ids'
]
enc_out
=
inputs
[
'backbone'
][
'encoder_outputs'
]
logits
=
fluid
.
layers
.
fc
(
input
=
enc_out
,
size
=
2
,
num_flatten_dims
=
2
,
param_attr
=
fluid
.
ParamAttr
(
name
=
"cls_squad_out_w"
,
initializer
=
fluid
.
initializer
.
TruncatedNormal
(
scale
=
0.02
)),
bias_attr
=
fluid
.
ParamAttr
(
name
=
"cls_squad_out_b"
,
initializer
=
fluid
.
initializer
.
Constant
(
0.
)))
logits
=
fluid
.
layers
.
transpose
(
x
=
logits
,
perm
=
[
2
,
0
,
1
])
start_logits
,
end_logits
=
fluid
.
layers
.
unstack
(
x
=
logits
,
axis
=
0
)
def
_compute_single_loss
(
logits
,
positions
):
"""Compute start/end loss for mrc model"""
loss
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
logits
,
label
=
positions
)
loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
return
loss
if
self
.
_is_training
:
start_loss
=
_compute_single_loss
(
start_logits
,
start_positions
)
end_loss
=
_compute_single_loss
(
end_logits
,
end_positions
)
total_loss
=
(
start_loss
+
end_loss
)
/
2.0
return
{
'loss'
:
total_loss
}
else
:
return
{
'start_logits'
:
start_logits
,
'end_logits'
:
end_logits
,
'unique_ids'
:
unique_id
}
def
postprocess
(
self
,
rt_outputs
):
"""this func will be called after each step(batch) of training/evaluating/predicting process."""
if
not
self
.
_is_training
:
unique_ids
=
np
.
squeeze
(
rt_outputs
[
'unique_ids'
],
-
1
)
start_logits
=
rt_outputs
[
'start_logits'
]
end_logits
=
rt_outputs
[
'end_logits'
]
for
idx
in
range
(
len
(
unique_ids
)):
if
unique_ids
[
idx
]
<
0
:
continue
if
len
(
self
.
_pred_results
)
%
1000
==
0
:
print
(
"Predicting example: {}"
.
format
(
len
(
self
.
_pred_results
)))
uid
=
int
(
unique_ids
[
idx
])
s
=
[
float
(
x
)
for
x
in
start_logits
[
idx
].
flat
]
e
=
[
float
(
x
)
for
x
in
end_logits
[
idx
].
flat
]
self
.
_pred_results
.
append
(
RawResult
(
unique_id
=
uid
,
start_logits
=
s
,
end_logits
=
e
))
    def epoch_postprocess(self, post_inputs):
        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
        if not self._is_training:
            if self._pred_output_path is None:
                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
            examples = post_inputs['reader']['examples']
            features = post_inputs['reader']['features']
            if not os.path.exists(self._pred_output_path):
                os.makedirs(self._pred_output_path)
            output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
            output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
            _write_predictions(examples, features, self._pred_results,
                               self._n_best_size, self._max_answer_length,
                               self._do_lower_case, output_prediction_file,
                               output_nbest_file, output_null_log_odds_file,
                               self._with_negative,
                               self._null_score_diff_threshold, self._verbose)
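
The `build` method earlier in this class extracts per-position start/end scores from a single fc output of width 2 by transposing `[batch, seq_len, 2]` to `[2, batch, seq_len]` and unstacking along axis 0. A minimal numpy sketch of that reshaping (shapes here are made up for illustration):

```python
import numpy as np

# logits as produced by the fc layer: [batch, seq_len, 2]
batch, seq_len = 4, 8
logits = np.random.rand(batch, seq_len, 2).astype("float32")

# transpose to [2, batch, seq_len] and split along the first axis,
# mirroring fluid.layers.transpose(perm=[2, 0, 1]) + unstack(axis=0)
transposed = np.transpose(logits, (2, 0, 1))
start_logits, end_logits = transposed[0], transposed[1]

assert start_logits.shape == (batch, seq_len)
assert end_logits.shape == (batch, seq_len)
```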
def _write_predictions(all_examples, all_features, all_results, n_best_size,
                       max_answer_length, do_lower_case, output_prediction_file,
                       output_nbest_file, output_null_log_odds_file,
                       with_negative, null_score_diff_threshold, verbose):
    """Write final predictions to the json file and log-odds of null if needed."""
    print("Writing predictions to: %s" % (output_prediction_file))
    print("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if with_negative:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = _get_final_text(tok_text, orig_text, do_lower_case, verbose)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry
        # debug
        if best_non_null_entry is None:
            print("Emmm..., sth wrong")

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
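
For data with unanswerable questions (`with_negative`), the answerability decision above reduces to a single threshold test on the gap between the null score and the best non-null span score. A small sketch with made-up numbers:

```python
# illustrative scores for one example (values are invented)
score_null = 1.5                       # start_logits[0] + end_logits[0], minimized over slices
best_start_logit, best_end_logit = 0.4, 0.3
null_score_diff_threshold = 0.0

score_diff = score_null - best_start_logit - best_end_logit  # 0.8
prediction = "" if score_diff > null_score_diff_threshold else "best non-null span"
print(prediction)  # "" -> predict unanswerable, since 0.8 > 0.0
```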
def _get_final_text(pred_text, orig_text, do_lower_case, verbose):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the MRQA eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose:
            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose:
            print("Length not equal after stripping spaces: '%s' vs '%s'" %
                  (orig_ns_text, tok_ns_text))
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose:
            print("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose:
            print("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
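
The projection hinges on `_strip_spaces`, which builds a map from positions in the space-stripped string back to positions in the original string. A minimal standalone sketch of that mapping, reusing the comment's own "Steve Smith's" example:

```python
import collections

def strip_spaces(text):
    # same idea as the inner _strip_spaces helper above
    ns_chars = []
    ns_to_s_map = collections.OrderedDict()
    for i, c in enumerate(text):
        if c == " ":
            continue
        ns_to_s_map[len(ns_chars)] = i
        ns_chars.append(c)
    return "".join(ns_chars), ns_to_s_map

ns_text, ns_map = strip_spaces("Steve Smith's")
print(ns_text)    # "SteveSmith's"
print(ns_map[5])  # 6: the 6th non-space char is the "S" of "Smith" at original index 6
```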
def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes
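
`_get_best_indexes` sorts the whole list to take the top n; an equivalent formulation (cheaper for long lists) uses `heapq.nlargest`. A small sketch showing the two agree:

```python
import heapq

logits = [0.1, 2.3, -0.5, 1.7, 0.9]
n_best_size = 3

# sort-based, as in _get_best_indexes above
by_sort = [i for i, _ in sorted(enumerate(logits), key=lambda x: x[1], reverse=True)][:n_best_size]

# heap-based alternative
by_heap = [i for _, i in heapq.nlargest(n_best_size, ((s, i) for i, s in enumerate(logits)))]

print(by_sort)  # [1, 3, 4]
print(by_heap)  # [1, 3, 4]
```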
def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []

    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
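
The max-subtraction in `_compute_softmax` is the standard numerical-stability trick: subtracting the maximum before exponentiating leaves the result unchanged mathematically while avoiding overflow. A quick check against large logits:

```python
import math

def compute_softmax(scores):
    # same algorithm as _compute_softmax above
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [e / total for e in exp_scores]

probs = compute_softmax([1000.0, 1001.0, 1002.0])  # naive exp(1000.0) would overflow
print([round(p, 4) for p in probs])                # [0.09, 0.2447, 0.6652]
```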
build/lib/paddlepalm/tokenizer/__init__.py (deleted, 100644 → 0)

build/lib/paddlepalm/tokenizer/bert_tokenizer.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import six


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""
    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    fin = open(vocab_file)
    for num, line in enumerate(fin):
        items = convert_to_unicode(line.strip()).split("\t")
        if len(items) > 2:
            break
        token = items[0]
        index = items[1] if len(items) == 2 else num
        token = token.strip()
        vocab[token] = int(index)
    return vocab
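
A note on the format `load_vocab` accepts: one token per line, optionally followed by a tab and an explicit id; when the id column is absent, the line number is used. A small self-checking sketch (the file path is hypothetical):

```python
# write a tiny vocab file in the implicit-id layout, then load it back
with open("toy_vocab.txt", "w") as f:
    f.write("[PAD]\n[UNK]\nthe\n")

vocab = load_vocab("toy_vocab.txt")  # load_vocab as defined above
assert vocab == {"[PAD]": 0, "[UNK]": 1, "the": 2}
```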
def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class CharTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in text.lower().split(" "):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self._never_lowercase = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self._never_lowercase:
                token = token.lower()
                token = self._run_strip_accents(token)
            if token in self._never_lowercase:
                split_tokens.extend([token])
            else:
                split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
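
End-to-end, `FullTokenizer` composes the basic tokenizer with the greedy longest-match-first WordPiece pass above. A usage sketch with a toy in-memory vocabulary (bypassing `load_vocab` so the snippet stays self-contained):

```python
# toy vocabulary; real models ship a full vocab file
toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "runs": 3, "[UNK]": 4}

wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wp.tokenize("xyzzy"))      # ['[UNK]'] -- no greedy match survives
```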
build/lib/paddlepalm/tokenizer/ernie_tokenizer.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from io import open

import collections
import unicodedata
import six


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""
    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, encoding='utf8') as fin:
        for num, line in enumerate(fin):
            items = convert_to_unicode(line.strip()).split("\t")
            if len(items) > 2:
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
    return vocab


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class CharTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in text.lower().split(" "):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def tokenize_chinese_chars(text):
    """Splits text into segments, isolating each CJK character and each
    whitespace character while buffering runs of all other characters."""

    def _is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    output = []
    buff = ""
    for char in text:
        cp = ord(char)
        if _is_chinese_char(cp) or _is_whitespace(char):
            if buff != "":
                output.append(buff)
                buff = ""
            output.append(char)
        else:
            buff += char

    if buff != "":
        output.append(buff)

    return output
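
Unlike the method of the same name on `BasicTokenizer`, this module-level `tokenize_chinese_chars` returns a list of segments, emitting each CJK character and each whitespace character as its own element. A small illustrative call:

```python
segments = tokenize_chinese_chars(u"中文abc 测试")
print(segments)  # ['中', '文', 'abc', ' ', '测', '试']
```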
build/lib/paddlepalm/utils/__init__.py (deleted, 100644 → 0)

build/lib/paddlepalm/utils/config_helper.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import argparse
import json
import yaml
import six
import logging

logging_only_message = "%(message)s"
logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"


class JsonConfig(object):
    """
    A high-level api for handling json configure file.
    """

    def __init__(self, config_path):
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        try:
            with open(config_path) as json_file:
                config_dict = json.load(json_file)
            assert isinstance(config_dict, dict), "Object in {} is NOT a dict.".format(config_path)
        except:
            raise IOError("Error in parsing bert model config file '%s'" % config_path)
        else:
            return config_dict

    def __getitem__(self, key):
        return self._config_dict[key]

    def asdict(self):
        return self._config_dict

    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            print('%s: %s' % (arg, value))
        print('------------------------------------------------')


class ArgumentGroup(object):
    def __init__(self, parser, title, des):
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self, name, type, default, help, **kwargs):
        type = str2bool if type == bool else type
        self._group.add_argument(
            "--" + name,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
            **kwargs)


class ArgConfig(object):
    """
    A high-level api for handling argument configs.
    """

    def __init__(self):
        parser = argparse.ArgumentParser()

        train_g = ArgumentGroup(parser, "training", "training options.")
        train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
        train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
        train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                        "scheduler of learning rate.",
                        choices=['linear_warmup_decay', 'noam_decay'])
        train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
        train_g.add_arg("warmup_proportion", float, 0.1,
                        "Proportion of training steps to perform linear learning rate warmup for.")
        train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
        train_g.add_arg("loss_scaling", float, 1.0,
                        "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
        train_g.add_arg("pred_dir", str, None, "Path to save the prediction results")

        log_g = ArgumentGroup(parser, "logging", "logging related.")
        log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
        log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")

        run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
        run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
        run_type_g.add_arg("use_fast_executor", bool, False,
                           "If set, use fast parallel executor (in experiment).")
        run_type_g.add_arg("num_iteration_per_drop_scope", int, 1,
                           "The iteration intervals to clean up temporary variables.")
        run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
        run_type_g.add_arg("do_predict", bool, True, "Whether to perform prediction.")

        custom_g = ArgumentGroup(parser, "customize", "customized options.")
        self.custom_g = custom_g

        self.parser = parser

    def add_arg(self, name, dtype, default, descrip):
        self.custom_g.add_arg(name, dtype, default, descrip)

    def build_conf(self):
        return self.parser.parse_args()


def str2bool(v):
    # because argparse does not support parsing "true, False" as python
    # booleans directly
    return v.lower() in ("true", "t", "1")


def print_arguments(args, log=None):
    if not log:
        print('----------- Configuration Arguments -----------')
        for arg, value in sorted(six.iteritems(vars(args))):
            print('%s: %s' % (arg, value))
        print('------------------------------------------------')
    else:
        log.info('----------- Configuration Arguments -----------')
        for arg, value in sorted(six.iteritems(vars(args))):
            log.info('%s: %s' % (arg, value))
        log.info('------------------------------------------------')


class PDConfig(object):
    """
    A high-level API for managing configuration files in PaddlePaddle.
    Can jointly work with command-line arguments, json files and yaml files.
    """

    def __init__(self, json_file=None, yaml_file=None, fuse_args=True):
        """
        Init function for PDConfig.

        json_file: the path to the json configure file.
        yaml_file: the path to the yaml configure file.
        fuse_args: if fuse the json/yaml configs with argparse.
        """
        if json_file is not None and yaml_file is not None:
            raise Warning(
                "json_file and yaml_file can not co-exist for now. "
                "please only use one configure file type.")
            return

        self.args = None
        self.arg_config = {}
        self.json_config = {}
        self.yaml_config = {}

        parser = argparse.ArgumentParser()

        self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
        self.json_g = ArgumentGroup(parser, "json", "options from json.")
        self.com_g = ArgumentGroup(parser, "custom", "customized options.")

        self.parser = parser

        if json_file is not None:
            assert isinstance(json_file, str)
            self.load_json(json_file, fuse_args=fuse_args)

        if yaml_file is not None:
            assert isinstance(yaml_file, str) or isinstance(yaml_file, list)
            self.load_yaml(yaml_file, fuse_args=fuse_args)

    def load_json(self, file_path, fuse_args=True):
        if not os.path.exists(file_path):
            raise Warning("the json file %s does not exist." % file_path)
            return

        with open(file_path, "r") as fin:
            self.json_config = json.loads(fin.read())

        if fuse_args:
            for name in self.json_config:
                if not isinstance(self.json_config[name], int) \
                        and not isinstance(self.json_config[name], float) \
                        and not isinstance(self.json_config[name], str) \
                        and not isinstance(self.json_config[name], bool):
                    continue
                self.json_g.add_arg(name,
                                    type(self.json_config[name]),
                                    self.json_config[name],
                                    "This is from %s" % file_path)

    def load_yaml(self, file_path_list, fuse_args=True):
        if isinstance(file_path_list, str):
            file_path_list = [file_path_list]
        for file_path in file_path_list:
            if not os.path.exists(file_path):
                raise Warning("the yaml file %s does not exist." % file_path)
                return

            with open(file_path, "r") as fin:
                self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)

            if fuse_args:
                for name in self.yaml_config:
                    if not isinstance(self.yaml_config[name], int) \
                            and not isinstance(self.yaml_config[name], float) \
                            and not isinstance(self.yaml_config[name], str) \
                            and not isinstance(self.yaml_config[name], bool):
                        continue
                    self.yaml_g.add_arg(name,
                                        type(self.yaml_config[name]),
                                        self.yaml_config[name],
                                        "This is from %s" % file_path)

    def build(self):
        self.args = self.parser.parse_args()
        self.arg_config = vars(self.args)

    def asdict(self):
        return self.arg_config

    def __add__(self, new_arg):
        assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
        assert len(new_arg) >= 3
        assert self.args is None

        name = new_arg[0]
        dtype = new_arg[1]
        dvalue = new_arg[2]
        desc = new_arg[3] if len(new_arg) == 4 else "Description is not provided."

        self.com_g.add_arg(name, dtype, dvalue, desc)

        return self

    def __getattr__(self, name):
        if name in self.arg_config:
            return self.arg_config[name]

        if name in self.json_config:
            return self.json_config[name]

        if name in self.yaml_config:
            return self.yaml_config[name]

        raise Warning("The argument %s is not defined." % name)

    def Print(self):
        print("-" * 70)
        for name in self.arg_config:
            print("{: <25}\t{}".format(str(name), str(self.arg_config[name])))

        for name in self.json_config:
            if name not in self.arg_config:
                # note: the original applied "%" to a str.format-style template
                # here, which raises TypeError; .format is the intended call
                print("{: <25}\t{}".format(str(name), str(self.json_config[name])))

        for name in self.yaml_config:
            if name not in self.arg_config:
                print("{: <25}\t{}".format(str(name), str(self.yaml_config[name])))

        print("-" * 70)


if __name__ == "__main__":
    pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
    pd_config += ("my_age", int, 18, "I am forever 18.")
    pd_config.build()

    print(pd_config.do_train)
    print(pd_config.hidden_size)
    print(pd_config.my_age)
build/lib/paddlepalm/utils/print_helper.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MAXLEN = 70


def print_dict(dic, title=""):
    if title:
        title = ' ' + title + ' '
        left_len = (MAXLEN - len(title)) // 2
        title = '-' * left_len + title
        right_len = MAXLEN - len(title)
        title = title + '-' * right_len
    else:
        title = '-' * MAXLEN

    print(title)
    for name in dic:
        print("{: <25}\t{}".format(str(name), str(dic[name])))
    print("")
    # print("-" * MAXLEN + '\n')
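
A quick look at the output format (keys left-padded to 25 columns under a centered, dash-framed 70-column title):

```python
print_dict({'learning_rate': 5e-5, 'batch_size': 32}, title='config')
# prints a 70-column dashed title line containing " config ",
# then one "key<TAB>value" row per entry, then a blank line
```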
build/lib/paddlepalm/utils/reader_helper.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import random
import numpy as np
import paddle
from paddle import fluid
from paddle.fluid import layers


def _check_and_adapt_shape_dtype(rt_val, attr):
    if not isinstance(rt_val, np.ndarray):
        rt_val = np.array(rt_val)
        assert rt_val.dtype != np.dtype('O'), "yielded data is not a valid tensor (number of elements on some dimension may differ)."
        if rt_val.dtype == np.dtype('float64'):
            rt_val = rt_val.astype('float32')

    shape, dtype = attr
    assert rt_val.dtype == np.dtype(dtype), "yielded data type not consistent with attr settings."
    assert len(shape) == rt_val.ndim, "yielded data rank (ndim) not consistent with attr settings."
    for rt, exp in zip(rt_val.shape, shape):
        if exp is None or exp < 0:
            continue
        assert rt == exp, "yielded data shape is not consistent with attr settings.\nExpected:{}\nActual:{}".format(exp, rt)
    return rt_val


def _zero_batch(attrs):
    pos_attrs = []
    for shape, dtype in attrs:
        pos_shape = [size if size and size > 0 else 1 for size in shape]
        pos_attrs.append([pos_shape, dtype])

    return [np.zeros(shape=shape, dtype=dtype) for shape, dtype in pos_attrs]


def _zero_batch_x(attrs, batch_size):
    pos_attrs = []
    for shape, dtype in attrs:
        # pos_shape = [size if size and size > 0 else 5 for size in shape]
        pos_shape = [size for size in shape]
        if pos_shape[0] == -1:
            pos_shape[0] = batch_size
        if pos_shape[1] == -1:
            pos_shape[1] = 512  # max seq len
        pos_attrs.append([pos_shape, dtype])

    return [np.zeros(shape=shape, dtype=dtype) for shape, dtype in pos_attrs]


def create_net_inputs(input_attrs, async=False, iterator_fn=None, dev_count=1, n_prefetch=1):
    # note: `async` became a reserved word in Python 3.7; this module targets
    # the Python 2.7 era of the codebase
    inputs = []
    ret = {}
    for name, shape, dtype in input_attrs:
        p = layers.data(name, shape=shape, dtype=dtype)
        ret[name] = p
        inputs.append(p)

    if async:
        assert iterator_fn is not None, "iterator_fn is needed for building async input layer."
        reader = fluid.io.PyReader(inputs, capacity=dev_count * n_prefetch, iterable=False)
        reader.decorate_batch_generator(iterator_fn)
        reader.start()

    return ret


def create_iterator_fn(iterator, iterator_prefix, shape_and_dtypes, outname_to_pos, verbose=0):
    # note: the original inner function was also named `iterator`, shadowing
    # the argument and breaking `next(iterator)`; it is renamed here, and the
    # undefined names `iterator_prefixe` and `joint_shape_and_dtypes` are
    # corrected to the actual arguments
    def iterator_fn():
        v = verbose
        while True:
            results = _zero_batch(shape_and_dtypes)

            outputs = next(iterator)  # dict type
            prefix = iterator_prefix
            for outname, val in outputs.items():
                task_outname = prefix + '/' + outname

                if outname in outname_to_pos:
                    idx = outname_to_pos[outname]
                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
                    results[idx] = val

                if task_outname in outname_to_pos:
                    idx = outname_to_pos[task_outname]
                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
                    results[idx] = val

            yield results

    return iterator_fn


def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtypes, mrs,
                             outname_to_pos, dev_count=1, keep_one_task=True, verbose=0,
                             batch_size=None):
    """
    joint_shape_and_dtypes: essentially determined by the attr settings of the
    backbone and paradigms, with the variable (-1) dimensions filled in
    automatically from the reader's attrs; validating against the iterator
    therefore gives a runtime correctness check of each batch.
    """
    task_ids = range(len(iterators))
    weights = [mr / float(sum(mrs)) for mr in mrs]
    if not keep_one_task:
        dev_count = 1

    # build fake batch
    # Note a caveat of this approach: setting a task's mix ratio to 0 does not
    # prevent reads from that task, so deleting its dataset would crash the run.
    # Compared with the previous zero-batch approach, however, it avoids the
    # assumption that only one dimension may be size=-1 and that the -1 in
    # dim 0 must be the batch size.
    results = _zero_batch(joint_shape_and_dtypes)
    outbuf = {}
    for id in task_ids:
        outputs = next(iterators[id])  # dict type
        outbuf[id] = outputs
        prefix = iterator_prefixes[id]
        for outname, val in outputs.items():
            task_outname = prefix + '/' + outname

            if outname in outname_to_pos:
                idx = outname_to_pos[outname]
                val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
                results[idx] = val

            if task_outname in outname_to_pos:
                idx = outname_to_pos[task_outname]
                val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
                results[idx] = val

    fake_batch = results
    dev_count_bak = dev_count

    def iterator():
        v = verbose
        while True:
            id = np.random.choice(task_ids, p=weights)
            results = fake_batch
            if v > 0:
                print('----- debug joint iterator -----')
                print('sampled task id: ' + str(id))
            task_id_tensor = np.array([[id]]).astype("int64")
            results[0] = task_id_tensor

            for i in range(dev_count):
                # results = _zero_batch(joint_shape_and_dtypes, batch_size=batch_size)
                # results[0] = task_id_tensor
                if id in outbuf:
                    outputs = outbuf[id]
                    del outbuf[id]
                else:
                    outputs = next(iterators[id])  # dict type

                prefix = iterator_prefixes[id]
                for outname, val in outputs.items():
                    if v > 0:
                        print('reader generate: ' + outname)
                    task_outname = prefix + '/' + outname

                    if outname in outname_to_pos:
                        idx = outname_to_pos[outname]
                        if v > 0:
                            print(outname + ' is inserted at idx ' + str(idx))
                        val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
                        results[idx] = val

                    if task_outname in outname_to_pos:
                        idx = outname_to_pos[task_outname]
                        if v > 0:
                            print(task_outname + ' is inserted at idx ' + str(idx))
                        val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
                        results[idx] = val

                if v > 0:
                    print('yielded batch len and shapes:')
                    print(len(results))
                    for i in results:
                        print(np.shape(i))
                    print('')
                v -= 1
                yield results

    return iterator
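
Task scheduling in the joint iterator boils down to weighted sampling: each step draws a task id with probability proportional to its mix ratio. A standalone numpy sketch of that draw (ratios are made up):

```python
import numpy as np

mrs = [1.0, 0.5, 0.5]                           # per-task mix ratios
weights = [mr / float(sum(mrs)) for mr in mrs]  # [0.5, 0.25, 0.25]

counts = {0: 0, 1: 0, 2: 0}
for _ in range(10000):
    task_id = np.random.choice(len(mrs), p=weights)
    counts[task_id] += 1
print(counts)  # roughly 5000 / 2500 / 2500 draws
```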
def merge_input_attrs(backbone_attr, task_attrs, insert_taskid=True):
    """
    Args:
        task_attrs(list[dict]|dict): task input attributes, key=attr_name,
            val=[shape, dtype]; supports both a single task and nested tasks
    """
    if isinstance(task_attrs, dict):
        task_attrs = [task_attrs]

    if insert_taskid:
        ret = [([1, 1], 'int64')]
        names = ['__task_id']
        start = 1
    else:
        ret = []
        names = []
        start = 0

    names += sorted(backbone_attr.keys())
    ret.extend([backbone_attr[k] for k in names[start:]])
    name_to_position = {}
    # pos=0 is for task_id, thus we start from 1
    for pos, k in enumerate(names):
        name_to_position[k] = pos
    for task_attr in task_attrs:
        task_names = sorted(task_attr.keys())
        names.extend(task_names)
        ret.extend([task_attr[k] for k in task_names])
        for pos, k in enumerate(task_names, start=len(name_to_position)):
            name_to_position[k] = pos
    return names, ret, name_to_position
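
A quick sketch of what `merge_input_attrs` produces for one backbone and one task (the attribute names and shapes below are invented for illustration):

```python
backbone_attr = {'token_ids': [[-1, -1, 1], 'int64'],
                 'position_ids': [[-1, -1, 1], 'int64']}
task_attr = {'start_positions': [[-1, 1], 'int64']}

names, attrs, name_to_pos = merge_input_attrs(backbone_attr, task_attr)
print(names)
# ['__task_id', 'position_ids', 'token_ids', 'start_positions']
print(name_to_pos['start_positions'])  # 3
```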
build/lib/paddlepalm/utils/saver.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import os
import six
import ast
import copy

import numpy as np
import paddle.fluid as fluid


def init_checkpoint(exe, init_checkpoint_path, main_program, skip_list=[]):
    assert os.path.exists(init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path

    def existed_persistables(var):
        if not fluid.io.is_persistable(var):
            return False
        if var.name in skip_list:
            return False
        return os.path.exists(os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persistables)
    print("Load model from {}".format(init_checkpoint_path))


def init_pretraining_params(exe, pretraining_params_path, main_program):
    assert os.path.exists(pretraining_params_path), "[%s] cannot be found." % pretraining_params_path

    def existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        return os.path.exists(os.path.join(pretraining_params_path, var.name))

    print("Load pretraining parameters from {}...\n".format(pretraining_params_path))
    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=existed_params)
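
Both helpers follow the same pattern: filter variables with a predicate and hand the survivors to `fluid.io.load_vars`. A hedged usage sketch (the executor setup is standard Paddle 1.x; the checkpoint paths are assumptions):

```python
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
main_program = fluid.default_main_program()

# restore a full training checkpoint (path is hypothetical)
init_checkpoint(exe, "output_model/ckpt.step1000", main_program)

# or warm-start from pretrained parameters only
init_pretraining_params(exe, "pretrain_model/ernie/params", main_program)
```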
build/lib/paddlepalm/utils/textprocess_helper.py (deleted, 100644 → 0)
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False
dist/paddle_palm-1.2-py2.7.egg (deleted, 100644 → 0): binary file deleted
paddle_palm.egg-info/PKG-INFO (deleted, 100644 → 0)
Metadata-Version: 1.1
Name: paddle-palm
Version: 1.2
Summary: A Multi-task Learning Lib for PaddlePaddle Users.
Home-page: https://github.com/PaddlePadd
Author: PaddlePaddle
Author-email: zhangyiming04@baidu.com
License: Apache 2.0
Description-Content-Type: text/markdown
Description:
# PaddlePALM, a Multi-task Learning Framework
# Installation
pip install paddlepalm
# Usage
### 1. Create task instances
Task instances are described in yaml format. The required fields of each task instance include
- train_file: path to the training set file
- reader: name of the dataset loading and preprocessing tool; the list of built-in readers is [here](https://www.baidu.com/)
- backbone: name of the backbone model; the list of built-in backbones is [here](https://www.baidu.com/)
- paradigm: name of the task paradigm (type); the list of built-in paradigms is [here](https://www.baidu.com/)
### 2. Complete the training configuration
Configure the parameters of multi-task learning in yaml format, e.g. specify the task instances together with their main/auxiliary relations, parameter reuse relations, sampling weights, etc.
### 3. Start training
```python
import paddlepalm as palm
if __name__ == '__main__':
    controller = palm.Controller('config.yaml', task_dir='task_instance')
    controller.load_pretrain('pretrain_model/ernie/params')
    controller.train()
```
### 4. Prediction
After training finishes, the pred interface can be called directly to run prediction on a target task.
Example:
```python
import paddlepalm as palm
if __name__ == '__main__':
    controller = palm.Controller(config_path='config.yaml', task_dir='task_instance')
    controller.load_pretrain('pretrain_model/ernie/params')
    controller.train()
    controller.pred('mrqa')
```
A new controller can also be created for prediction only:
```python
import paddlepalm as palm
if __name__ == '__main__':
    controller = palm.Controller(config_path='config.yaml', task_dir='task_instance')
    controller.pred('mrqa', infermodel_path='output_model/firstrun2/infer_model')
```
# How it works
### Multi-task learning mechanism
pass
### Training termination mechanism
- Default settings:
    - **Multi-task learning stops once every target task reaches its target number of training steps**
    - Tasks not marked as target tasks (i.e. auxiliary tasks) do not affect termination; they only play a "supporting" role in training
    - Note: all tasks are target tasks by default; target/auxiliary tasks can be marked via `target_tag`
- The target number of training steps of each target task is computed from num_epochs and mix_ratio
### Saving mechanism
- Default settings:
    - Models saved during training fall into two kinds, checkpoint (ckpt) and inference model (infermodel):
        - ckpt saves the full computation graph covering all tasks (i.e. the whole multi-task learning graph) and is used to resume interrupted training
        - infermodel saves the inference graph of one target task plus the configs that inference depends on
    - For each target task, an inference model is saved automatically once the expected number of steps is reached, and not afterwards. (Note: saving an inference model does not affect ckpt saving)
- User-adjustable settings
    - Use `save_ckpt_every_steps` to control how often ckpt is saved; off by default
    - Each task instance can use `save_infermodel_every_steps` to control how often it saves its infermodel; default -1, i.e. save only once upon reaching the target number of training steps
Keywords: paddlepaddle,paddle,multi-task-learning
Platform: any
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
paddle_palm.egg-info/SOURCES.txt (deleted, 100644 → 0)
README.md
setup.cfg
setup.py
./paddlepalm/__init__.py
./paddlepalm/default_settings.py
./paddlepalm/interface.py
./paddlepalm/mtl_controller.py
./paddlepalm/task_instance.py
./paddlepalm/backbone/__init__.py
./paddlepalm/backbone/bert.py
./paddlepalm/backbone/bow.py
./paddlepalm/backbone/ernie.py
./paddlepalm/backbone/utils/__init__.py
./paddlepalm/backbone/utils/transformer.py
./paddlepalm/optimizer/__init__.py
./paddlepalm/optimizer/adam.py
./paddlepalm/reader/__init__.py
./paddlepalm/reader/cls4bert.py
./paddlepalm/reader/match4ernie.py
./paddlepalm/reader/mlm.py
./paddlepalm/reader/mrc4bert.py
./paddlepalm/reader/mrc4ernie.py
./paddlepalm/reader/utils/__init__.py
./paddlepalm/reader/utils/batching4bert.py
./paddlepalm/reader/utils/batching4ernie.py
./paddlepalm/reader/utils/mlm_batching.py
./paddlepalm/reader/utils/mrqa_helper.py
./paddlepalm/reader/utils/reader4ernie.py
./paddlepalm/task_paradigm/__init__.py
./paddlepalm/task_paradigm/cls.py
./paddlepalm/task_paradigm/match.py
./paddlepalm/task_paradigm/mlm.py
./paddlepalm/task_paradigm/mrc.py
./paddlepalm/tokenizer/__init__.py
./paddlepalm/tokenizer/bert_tokenizer.py
./paddlepalm/tokenizer/ernie_tokenizer.py
./paddlepalm/utils/__init__.py
./paddlepalm/utils/config_helper.py
./paddlepalm/utils/print_helper.py
./paddlepalm/utils/reader_helper.py
./paddlepalm/utils/saver.py
./paddlepalm/utils/textprocess_helper.py
paddle_palm.egg-info/PKG-INFO
paddle_palm.egg-info/SOURCES.txt
paddle_palm.egg-info/dependency_links.txt
paddle_palm.egg-info/not-zip-safe
paddle_palm.egg-info/top_level.txt
\ No newline at end of file
paddle_palm.egg-info/dependency_links.txt (deleted, 100644 → 0)

paddle_palm.egg-info/not-zip-safe (deleted, 100644 → 0)

paddle_palm.egg-info/top_level.txt (deleted, 100644 → 0)
paddlepalm