Commit e0df8292
Author: wanghaoshuang
Date: April 13, 2020
Parent: 84d76531

Add teacher model of bert.
Showing 16 changed files with 3860 additions and 0 deletions (+3860 -0)
paddleslim/teachers/bert/__init__.py                         +20   -0
paddleslim/teachers/bert/batching.py                         +189  -0
paddleslim/teachers/bert/cls.py                              +243  -0
paddleslim/teachers/bert/model/__init__.py                   +0    -0
paddleslim/teachers/bert/model/bert.py                       +269  -0
paddleslim/teachers/bert/model/cls.py                        +99   -0
paddleslim/teachers/bert/model/transformer_encoder.py        +398  -0
paddleslim/teachers/bert/optimization.py                     +170  -0
paddleslim/teachers/bert/reader/__init__.py                  +0    -0
paddleslim/teachers/bert/reader/cls.py                       +552  -0
paddleslim/teachers/bert/reader/pretraining.py               +289  -0
paddleslim/teachers/bert/reader/squad.py                     +935  -0
paddleslim/teachers/bert/tokenization.py                     +371  -0
paddleslim/teachers/bert/utils/__init__.py                   +0    -0
paddleslim/teachers/bert/utils/convert_static_to_dygraph.py  +228  -0
paddleslim/teachers/bert/utils/fp16.py                       +97   -0
未找到文件。
paddleslim/teachers/bert/__init__.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from ..bert import cls
from .cls import *

__all__ = []
__all__ += cls.__all__
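
The net effect of these two imports is that the package root re-exports whatever cls.py lists in its __all__ (here, BERTClassifier). A one-line sketch of the downstream import this enables:

    # BERTClassifier lives in cls.py but is importable from the package root.
    from paddleslim.teachers.bert import BERTClassifier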
paddleslim/teachers/bert/batching.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos;
    Note: mask_pos corresponds to positions in batch_tokens after padding;
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    pre_sent_len = 0
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        prob_index += pre_sent_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            if prob > 0.15:
                continue
            elif 0.03 < prob <= 0.15:
                # mask
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            elif 0.015 < prob <= 0.03:
                # random replace
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = replace_ids[prob_index + token_index]
                    mask_flag = True
                    mask_pos.append(sent_index * max_len + token_index)
            else:
                # keep the original token
                if token != SEP and token != CLS:
                    mask_label.append(sent[token_index])
                    mask_pos.append(sent_index * max_len + token_index)
        pre_sent_len = len(sent)
        # ensure at least one word in a sentence is masked
        while not mask_flag:
            token_index = int(
                np.random.randint(1, high=len(sent) - 1, size=1))
            if sent[token_index] != SEP and sent[token_index] != CLS:
                mask_label.append(sent[token_index])
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos


def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False):
    """
    1. generate Tensor of data
    2. generate Tensor of position
    3. generate self attention mask, [shape: batch_size * max_len * max_len]
    """
    batch_src_ids = [inst[0] for inst in insts]
    batch_sent_ids = [inst[1] for inst in insts]
    batch_pos_ids = [inst[2] for inst in insts]
    labels_list = []
    # compatible with squad, whose example includes start/end positions,
    # or unique id
    for i in range(3, len(insts[0]), 1):
        labels = [inst[i] for inst in insts]
        labels = np.array(labels).astype("int64").reshape([-1, 1])
        labels_list.append(labels)

    # First step: do mask without padding
    if mask_id >= 0:
        out, mask_label, mask_pos = mask(
            batch_src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = batch_src_ids

    # Second step: padding
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        batch_pos_ids,
        pad_idx=pad_id,
        return_pos=False,
        return_input_mask=False)
    sent_id = pad_batch_data(
        batch_sent_ids,
        pad_idx=pad_id,
        return_pos=False,
        return_input_mask=False)

    if mask_id >= 0:
        return_list = [
            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
        ] + labels_list
    else:
        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list

    return return_list if len(return_list) > 1 else return_list[0]


def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and input mask.
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array([
        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
    ])
    return_list += [inst_data.astype("int64").reshape([-1, max_len])]

    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array(
            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list += [input_mask_data.astype("float32")]

    if return_max_len:
        return_list += [max_len]

    if return_num_token:
        num_token = 0
        for inst in insts:
            num_token += len(inst)
        return_list += [num_token]

    return return_list if len(return_list) > 1 else return_list[0]


if __name__ == "__main__":
    pass
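
To make the thresholds in mask() concrete: prob_mask is uniform on [0, 1), so a token is left untouched with probability 0.85, replaced by MASK for draws in (0.03, 0.15], randomly replaced for draws in (0.015, 0.03], and kept but still predicted for draws at or below 0.015. A minimal sketch of driving prepare_batch_data by hand; the token ids below are toy values, not a real vocabulary:

    import numpy as np
    from paddleslim.teachers.bert.batching import prepare_batch_data

    # Each instance is [src_ids, sent_ids, pos_ids, label]; the ids are made up.
    insts = [
        [[1, 10, 11, 2], [0, 0, 0, 0], [0, 1, 2, 3], 0],
        [[1, 12, 2], [0, 0, 0], [0, 1, 2], 1],
    ]
    total_token_num = sum(len(inst[0]) for inst in insts)

    # mask_id=-1 skips MLM masking, so the batch is only padded to max_len=4.
    src_id, pos_id, sent_id, input_mask, labels = prepare_batch_data(
        insts, total_token_num, voc_size=-1, pad_id=0, cls_id=1, sep_id=2,
        mask_id=-1)
    print(src_id.shape)      # (2, 4)
    print(input_mask.shape)  # (2, 4, 1)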
paddleslim/teachers/bert/cls.py  0 → 100755
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import six
import sys
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer

from .reader import cls as reader
from .model.bert import BertConfig
from .model.cls import ClsModelLayer
from .optimization import Optimizer
from .utils.args import ArgumentGroup, print_arguments, check_cuda
from .utils.init import init_from_static_model

__all__ = ["BERTClassifier"]


def create_data(batch):
    """
    convert data to variable
    """
    src_ids = to_variable(batch[0], "src_ids")
    position_ids = to_variable(batch[1], "position_ids")
    sentence_ids = to_variable(batch[2], "sentence_ids")
    input_mask = to_variable(batch[3], "input_mask")
    labels = to_variable(batch[4], "labels")
    labels.stop_gradient = True
    return src_ids, position_ids, sentence_ids, input_mask, labels


class BERTClassifier(Layer):
    def __init__(self,
                 num_labels,
                 task_name="mnli",
                 model_path=None,
                 use_cuda=True):
        super(BERTClassifier, self).__init__()
        self.task_name = task_name.lower()
        BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
        bert_config_path = BERT_BASE_PATH + "/bert_config.json"
        self.vocab_path = BERT_BASE_PATH + "/vocab.txt"
        self.init_pretraining_params = BERT_BASE_PATH + "/dygraph_params/"
        self.do_lower_case = True
        self.bert_config = BertConfig(bert_config_path)

        if use_cuda:
            self.dev_count = fluid.core.get_cuda_device_count()
        else:
            self.dev_count = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        self.trainer_count = fluid.dygraph.parallel.Env().nranks

        self.processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }

        self.cls_model = ClsModelLayer(
            self.bert_config, num_labels, return_pooled_out=True)

        if self.init_pretraining_params:
            print("Load pre-trained model from %s" %
                  self.init_pretraining_params)
            init_from_static_model(self.init_pretraining_params,
                                   self.cls_model, self.bert_config)

        if model_path is not None:
            # restore the model
            print("Load params from %s" % model_path)
            model_dict, _ = fluid.load_dygraph(model_path)
            self.cls_model.load_dict(model_dict)

    def forward(self, input):
        return self.cls_model(input)

    def test(self, data_dir, batch_size=64, max_seq_len=512):
        processor = self.processors[self.task_name](
            data_dir=data_dir,
            vocab_path=self.vocab_path,
            max_seq_len=max_seq_len,
            do_lower_case=self.do_lower_case,
            in_tokens=False)

        test_data_generator = processor.data_generator(
            batch_size=batch_size, phase='dev', epoch=1, shuffle=False)

        self.cls_model.eval()
        total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
        for batch in test_data_generator():
            data_ids = create_data(batch)
            total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
            np_loss = total_loss.numpy()
            np_acc = np_acces[-1].numpy()
            np_avg_acc = np.mean([acc.numpy() for acc in np_acces])
            np_num_seqs = np_num_seqs.numpy()
            total_cost.extend(np_loss * np_num_seqs)
            final_acc.extend(np_acc * np_num_seqs)
            avg_acc.extend(np_avg_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)
        print("[evaluation] classifier[-1] average acc: %f; average acc: %f" %
              (np.sum(final_acc) / np.sum(total_num_seqs),
               np.sum(avg_acc) / np.sum(total_num_seqs)))
        self.cls_model.train()

    def fit(self,
            data_dir,
            epoch,
            batch_size=64,
            use_cuda=True,
            max_seq_len=512,
            warmup_proportion=0.1,
            use_data_parallel=False,
            learning_rate=0.0001,
            weight_decay=0.01,
            lr_scheduler="linear_warmup_decay",
            skip_steps=10,
            save_steps=10000,
            checkpoints="checkpoints"):
        processor = self.processors[self.task_name](
            data_dir=data_dir,
            vocab_path=self.vocab_path,
            max_seq_len=max_seq_len,
            do_lower_case=self.do_lower_case,
            in_tokens=False,
            random_seed=5512)

        shuffle_seed = 1 if self.trainer_count > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=batch_size,
            phase='train',
            epoch=epoch,
            dev_count=self.trainer_count,
            shuffle=True,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')
        max_train_steps = epoch * num_train_examples // batch_size // self.trainer_count
        warmup_steps = int(max_train_steps * warmup_proportion)

        print("Device count: %d" % self.dev_count)
        print("Trainer count: %d" % self.trainer_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        if use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        optimizer = Optimizer(
            warmup_steps=warmup_steps,
            num_train_steps=max_train_steps,
            learning_rate=learning_rate,
            model_cls=self.cls_model,
            weight_decay=weight_decay,
            scheduler=lr_scheduler,
            loss_scaling=1.0,
            parameter_list=self.cls_model.parameters())

        if use_data_parallel:
            self.cls_model = fluid.dygraph.parallel.DataParallel(
                self.cls_model, strategy)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        steps = 0
        time_begin = time.time()
        for batch in train_data_generator():
            data_ids = create_data(batch)
            total_loss, logits, losses, accuracys, num_seqs = self.cls_model(
                data_ids)

            optimizer.optimization(
                total_loss,
                use_data_parallel=use_data_parallel,
                model=self.cls_model)
            self.cls_model.clear_gradients()

            if steps != 0 and steps % skip_steps == 0:
                time_end = time.time()
                used_time = time_end - time_begin
                current_example, current_epoch = processor.get_train_progress()
                localtime = time.asctime(time.localtime(time.time()))
                print(
                    "%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
                    % (localtime, current_epoch, steps, total_loss.numpy(),
                       accuracys[-1].numpy(), skip_steps / used_time))
                time_begin = time.time()

            if steps != 0 and steps % save_steps == 0 and fluid.dygraph.parallel.Env(
            ).local_rank == 0:
                self.test(data_dir, batch_size=8, max_seq_len=512)
                save_path = os.path.join(checkpoints,
                                         "steps" + "_" + str(steps))
                fluid.save_dygraph(self.cls_model.state_dict(), save_path)
                fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
                print("Save model parameters and optimizer status at %s" %
                      save_path)

            steps += 1

        if fluid.dygraph.parallel.Env().local_rank == 0:
            save_path = os.path.join(checkpoints, "final")
            fluid.save_dygraph(self.cls_model.state_dict(), save_path)
            fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
            print("Save model parameters and optimizer status at %s" %
                  save_path)
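
Putting it together, fine-tuning the teacher is a construct-and-fit affair. A minimal driver sketch; the GLUE data directory is an assumption, and BERTClassifier.__init__ additionally expects pretrained weights under the hard-coded ./data/pretrained_models/uncased_L-12_H-768_A-12/ path:

    import paddle.fluid as fluid
    from paddleslim.teachers.bert import BERTClassifier

    place = fluid.CUDAPlace(0)  # or fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # MNLI: contradiction / entailment / neutral -> num_labels=3.
        teacher = BERTClassifier(num_labels=3, task_name="mnli")
        teacher.fit("./data/glue/MNLI", epoch=3, batch_size=32,
                    max_seq_len=128)
        teacher.test("./data/glue/MNLI", batch_size=32, max_seq_len=128)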
paddleslim/teachers/bert/model/__init__.py  0 → 100644  (empty file)
paddleslim/teachers/bert/model/bert.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import json
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard

from .transformer_encoder import EncoderLayer, PrePostProcessLayer


class BertConfig(object):
    def __init__(self, config_path):
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        try:
            with open(config_path) as json_file:
                config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing bert model config file '%s'" %
                          config_path)
        else:
            return config_dict

    def __getitem__(self, key):
        return self._config_dict[key]

    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            print('%s: %s' % (arg, value))
        print('------------------------------------------------')


class BertModelLayer(Layer):
    """
    bert
    """

    def __init__(self, config, return_pooled_out=True, use_fp16=False):
        super(BertModelLayer, self).__init__()

        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        self._sent_types = config['type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
        self.return_pooled_out = return_pooled_out

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"

        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])

        self._src_emb = Embedding(
            size=[self._voc_size, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)
        self._pos_emb = Embedding(
            size=[self._max_position_seq_len, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer),
            dtype=self._dtype)
        self._sent_emb = Embedding(
            size=[self._sent_types, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name,
                initializer=self._param_initializer),
            dtype=self._dtype)

        self.pooled_fc = Linear(
            input_dim=self._emb_size,
            output_dim=self._emb_size,
            param_attr=fluid.ParamAttr(
                name="pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0",
            act="tanh")

        self.pre_process_layer = PrePostProcessLayer(
            "nd", self._emb_size, self._prepostprocess_dropout, "")

        self._encoder = EncoderLayer(
            hidden_act=self._hidden_act,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer)

    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """
        forward
        """
        src_emb = self._src_emb(src_ids)
        pos_emb = self._pos_emb(position_ids)
        sent_emb = self._sent_emb(sentence_ids)

        emb_out = src_emb + pos_emb
        emb_out = emb_out + sent_emb

        emb_out = self.pre_process_layer(emb_out)

        self_attn_mask = fluid.layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)
        self_attn_mask = fluid.layers.scale(
            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

        enc_outputs = self._encoder(emb_out, n_head_self_attn_mask)

        if not self.return_pooled_out:
            return enc_outputs
        next_sent_feats = []
        for enc_output in enc_outputs:
            next_sent_feat = fluid.layers.slice(
                input=enc_output, axes=[1], starts=[0], ends=[1])
            next_sent_feat = self.pooled_fc(next_sent_feat)
            next_sent_feat = fluid.layers.reshape(
                next_sent_feat, shape=[-1, self._emb_size])
            next_sent_feats.append(next_sent_feat)

        return enc_outputs, next_sent_feats


class PretrainModelLayer(Layer):
    """
    pretrain model
    """

    def __init__(self,
                 config,
                 return_pooled_out=True,
                 weight_sharing=True,
                 use_fp16=False):
        super(PretrainModelLayer, self).__init__()
        self.config = config
        self._voc_size = config['vocab_size']
        self._emb_size = config['hidden_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']

        self._word_emb_name = "word_embedding"
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])
        self._weight_sharing = weight_sharing
        self.use_fp16 = use_fp16
        self._dtype = "float16" if use_fp16 else "float32"

        self.bert_layer = BertModelLayer(
            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)

        self.pre_process_layer = PrePostProcessLayer(
            "n", self._emb_size, self._prepostprocess_dropout, "pre_encoder")

        self.pooled_fc = Linear(
            input_dim=self._emb_size,
            output_dim=self._emb_size,
            param_attr=fluid.ParamAttr(
                name="mask_lm_trans_fc.w_0",
                initializer=self._param_initializer),
            bias_attr="mask_lm_trans_fc.b_0",
            act="tanh")

        self.mask_lm_out_bias_attr = fluid.ParamAttr(
            name="mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))

        if not self._weight_sharing:
            self.out_fc = Linear(
                input_dim=self._emb_size,
                output_dim=self._voc_size,
                param_attr=fluid.ParamAttr(
                    name="mask_lm_out_fc.w_0",
                    initializer=self._param_initializer),
                bias_attr=self.mask_lm_out_bias_attr)
        else:
            self.fc_create_params = self.create_parameter(
                shape=[self._voc_size],
                dtype=self._dtype,
                attr=self.mask_lm_out_bias_attr,
                is_bias=True)

        self.next_sent_fc = Linear(
            input_dim=self._emb_size,
            output_dim=2,
            param_attr=fluid.ParamAttr(
                name="next_sent_fc.w_0", initializer=self._param_initializer),
            bias_attr="next_sent_fc.b_0")

    def forward(self, src_ids, position_ids, sentence_ids, input_mask,
                mask_label, mask_pos, labels):
        """
        forward
        """
        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                     sentence_ids, input_mask)
        reshaped_emb_out = fluid.layers.reshape(
            x=enc_output, shape=[-1, self._emb_size])

        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
        mask_trans_feat = self.pooled_fc(mask_feat)
        mask_trans_feat = self.pre_process_layer(mask_trans_feat)

        if self._weight_sharing:
            fc_out = fluid.layers.matmul(
                x=mask_trans_feat,
                y=self.bert_layer._src_emb._w,
                transpose_y=True)
            fc_out += self.fc_create_params
        else:
            fc_out = self.out_fc(mask_trans_feat)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=mask_label)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

        next_sent_fc_out = self.next_sent_fc(next_sent_feat)

        next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
            logits=next_sent_fc_out, label=labels, return_softmax=True)

        next_sent_acc = fluid.layers.accuracy(
            input=next_sent_softmax, label=labels)

        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

        loss = mean_next_sent_loss + mean_mask_lm_loss
        return next_sent_acc, mean_mask_lm_loss, loss
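
The attention-mask arithmetic in BertModelLayer.forward deserves a note: input_mask arrives as a (batch, max_len, 1) float tensor of ones over real tokens, the matmul expands it into a pairwise (batch, max_len, max_len) visibility map, and scale(..., scale=10000.0, bias=-1.0, bias_after_scale=False) computes 10000 * (x - 1), i.e. 0 where both positions are real and -10000 where either is padding, which then biases the attention logits before softmax. The same computation in plain numpy:

    import numpy as np

    input_mask = np.array([[[1.], [1.], [0.]]])  # batch=1, max_len=3, last is pad
    pairwise = np.matmul(input_mask, input_mask.transpose(0, 2, 1))
    attn_bias = 10000.0 * (pairwise - 1.0)       # 0 for real pairs, -1e4 otherwise
    print(attn_bias[0])
    # [[     0.      0. -10000.]
    #  [     0.      0. -10000.]
    #  [-10000. -10000. -10000.]]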
paddleslim/teachers/bert/model/cls.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import json
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer

from .bert import BertModelLayer


class ClsModelLayer(Layer):
    """
    classify model
    """

    def __init__(self,
                 config,
                 num_labels,
                 is_training=True,
                 return_pooled_out=True,
                 loss_scaling=1.0,
                 use_fp16=False):
        super(ClsModelLayer, self).__init__()
        self.config = config
        self.is_training = is_training
        self.use_fp16 = use_fp16
        self.loss_scaling = loss_scaling

        self.bert_layer = BertModelLayer(
            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)

        self.cls_fc = Linear(
            input_dim=self.config["hidden_size"],
            output_dim=num_labels,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    def forward(self, data_ids):
        """
        forward
        """
        src_ids = data_ids[0]
        position_ids = data_ids[1]
        sentence_ids = data_ids[2]
        input_mask = data_ids[3]
        labels = data_ids[4]

        enc_outputs, next_sent_feats = self.bert_layer(
            src_ids, position_ids, sentence_ids, input_mask)

        logits = []
        losses = []
        accuracys = []
        for next_sent_feat in next_sent_feats:
            cls_feat = fluid.layers.dropout(
                x=next_sent_feat,
                dropout_prob=0.1,
                dropout_implementation="upscale_in_train")
            logit = self.cls_fc(cls_feat)
            logits.append(logit)

            ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
                logits=logit, label=labels, return_softmax=True)
            loss = fluid.layers.mean(x=ce_loss)
            losses.append(loss)

            if self.use_fp16 and self.loss_scaling > 1.0:
                loss *= self.loss_scaling

            num_seqs = fluid.layers.create_tensor(dtype='int64')
            accuracy = fluid.layers.accuracy(
                input=probs, label=labels, total=num_seqs)
            accuracys.append(accuracy)

        total_loss = fluid.layers.sum(losses)

        return total_loss, logits, losses, accuracys, num_seqs
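
Because ClsModelLayer attaches the same classification head to the pooled [CLS] feature of every encoder sub-layer and sums the per-layer losses, callers get one logit/loss/accuracy per layer — the hook a student model can later distill against. A runnable sketch with a deliberately tiny, made-up config (the real teacher uses BERT-base's bert_config.json):

    import json
    import numpy as np
    import paddle.fluid as fluid
    from paddleslim.teachers.bert.model.bert import BertConfig
    from paddleslim.teachers.bert.model.cls import ClsModelLayer

    # A tiny stand-in config so the sketch runs without downloading BERT-base.
    cfg = dict(hidden_size=48, num_hidden_layers=2, num_attention_heads=4,
               vocab_size=100, max_position_embeddings=64, type_vocab_size=2,
               hidden_act="relu", hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1, initializer_range=0.02)
    with open("tiny_bert_config.json", "w") as f:
        json.dump(cfg, f)

    with fluid.dygraph.guard():
        model = ClsModelLayer(BertConfig("tiny_bert_config.json"), num_labels=3)
        to_var = fluid.dygraph.to_variable
        src = to_var(np.random.randint(0, 100, (2, 8)).astype("int64"))
        pos = to_var(np.tile(np.arange(8), (2, 1)).astype("int64"))
        sent = to_var(np.zeros((2, 8), dtype="int64"))
        mask = to_var(np.ones((2, 8, 1), dtype="float32"))
        labels = to_var(np.zeros((2, 1), dtype="int64"))
        total_loss, logits, losses, accs, num_seqs = model(
            (src, pos, sent, mask, labels))
        print(len(logits))  # one classifier head per encoder sub-layer -> 2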
paddleslim/teachers/bert/model/transformer_encoder.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer


class PrePostProcessLayer(Layer):
    """
    PrePostProcessLayer
    """

    def __init__(self, process_cmd, d_model, dropout_rate, name):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd
        self.functors = []
        self.exec_order = ""

        for cmd in self.process_cmd:
            if cmd == "a":  # add residual connection
                self.functors.append(
                    lambda x, y: x + y if y is not None else x)
                self.exec_order += "a"
            elif cmd == "n":  # add layer normalization
                self.functors.append(
                    self.add_sublayer(
                        "layer_norm_%d" % len(
                            self.sublayers(include_sublayers=False)),
                        LayerNorm(
                            normalized_shape=d_model,
                            param_attr=fluid.ParamAttr(
                                name=name + "_layer_norm_scale",
                                initializer=fluid.initializer.Constant(1.)),
                            bias_attr=fluid.ParamAttr(
                                name=name + "_layer_norm_bias",
                                initializer=fluid.initializer.Constant(0.)))))
                self.exec_order += "n"
            elif cmd == "d":  # add dropout
                if dropout_rate:
                    self.functors.append(lambda x: fluid.layers.dropout(
                        x, dropout_prob=dropout_rate, is_test=False))
                    self.exec_order += "d"

    def forward(self, x, residual=None):
        for i, cmd in enumerate(self.exec_order):
            if cmd == "a":
                x = self.functors[i](x, residual)
            else:
                x = self.functors[i](x)
        return x


class PositionwiseFeedForwardLayer(Layer):
    """
    PositionwiseFeedForwardLayer
    """

    def __init__(self,
                 hidden_act,
                 d_inner_hid,
                 d_model,
                 dropout_rate,
                 param_initializer=None,
                 name=""):
        super(PositionwiseFeedForwardLayer, self).__init__()

        self._i2h = Linear(
            input_dim=d_model,
            output_dim=d_inner_hid,
            param_attr=fluid.ParamAttr(
                name=name + '_fc_0.w_0', initializer=param_initializer),
            bias_attr=name + '_fc_0.b_0',
            act=hidden_act)

        self._h2o = Linear(
            input_dim=d_inner_hid,
            output_dim=d_model,
            param_attr=fluid.ParamAttr(
                name=name + '_fc_1.w_0', initializer=param_initializer),
            bias_attr=name + '_fc_1.b_0')

        self._dropout_rate = dropout_rate

    def forward(self, x):
        """
        forward
        :param x:
        :return:
        """
        hidden = self._i2h(x)
        if self._dropout_rate:
            hidden = fluid.layers.dropout(
                hidden,
                dropout_prob=self._dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = self._h2o(hidden)
        return out


class MultiHeadAttentionLayer(Layer):
    """
    MultiHeadAttentionLayer
    """

    def __init__(self,
                 d_key,
                 d_value,
                 d_model,
                 n_head=1,
                 dropout_rate=0.,
                 cache=None,
                 gather_idx=None,
                 static_kv=False,
                 param_initializer=None,
                 name=""):
        super(MultiHeadAttentionLayer, self).__init__()
        self._n_head = n_head
        self._d_key = d_key
        self._d_value = d_value
        self._d_model = d_model
        self._dropout_rate = dropout_rate

        self._q_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_query_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_query_fc.b_0')
        self._k_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_key_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_key_fc.b_0')
        self._v_fc = Linear(
            input_dim=d_model,
            output_dim=d_value * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_value_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_value_fc.b_0')
        self._proj_fc = Linear(
            input_dim=d_value * n_head,
            output_dim=d_model,
            param_attr=fluid.ParamAttr(
                name=name + '_output_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_output_fc.b_0')

    def forward(self, queries, keys, values, attn_bias):
        """
        forward
        :param queries:
        :param keys:
        :param values:
        :param attn_bias:
        :return:
        """
        # compute q, k, v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q = self._q_fc(queries)
        k = self._k_fc(keys)
        v = self._v_fc(values)

        # split head
        q_hidden_size = q.shape[-1]
        reshaped_q = fluid.layers.reshape(
            x=q,
            shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
            inplace=False)
        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
        k_hidden_size = k.shape[-1]
        reshaped_k = fluid.layers.reshape(
            x=k,
            shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
            inplace=False)
        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
        v_hidden_size = v.shape[-1]
        reshaped_v = fluid.layers.reshape(
            x=v,
            shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
            inplace=False)
        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])

        scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
        # scale dot product attention
        product = fluid.layers.matmul(
            #x=transpose_q,
            x=scaled_q,
            y=transpose_k,
            transpose_y=True)
        #alpha=self._d_model**-0.5)
        if attn_bias is not None:
            product += attn_bias
        weights = fluid.layers.softmax(product)
        if self._dropout_rate:
            weights_droped = fluid.layers.dropout(
                weights,
                dropout_prob=self._dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
            out = fluid.layers.matmul(weights_droped, transpose_v)
        else:
            out = fluid.layers.matmul(weights, transpose_v)

        # combine heads
        if len(out.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
        final_out = fluid.layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=False)

        # fc to output
        proj_out = self._proj_fc(final_out)
        return proj_out


class EncoderSubLayer(Layer):
    """
    EncoderSubLayer
    """

    def __init__(self,
                 hidden_act,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
                 param_initializer=None,
                 name=""):
        super(EncoderSubLayer, self).__init__()
        self.name = name
        self._preprocess_cmd = preprocess_cmd
        self._postprocess_cmd = postprocess_cmd
        self._prepostprocess_dropout = prepostprocess_dropout

        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            prepostprocess_dropout,
            name=name + "_pre_att")
        self._multihead_attention_layer = MultiHeadAttentionLayer(
            d_key,
            d_value,
            d_model,
            n_head,
            attention_dropout,
            None,
            None,
            False,
            param_initializer,
            name=name + "_multi_head_att")
        self._postprocess_layer = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_att")
        self._preprocess_layer2 = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_pre_ffn")
        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
            hidden_act,
            d_inner_hid,
            d_model,
            relu_dropout,
            param_initializer,
            name=name + "_ffn")
        self._postprocess_layer2 = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_ffn")

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input:
        :param attn_bias:
        :return:
        """
        pre_process_multihead = self._preprocess_layer(enc_input)
        attn_output = self._multihead_attention_layer(pre_process_multihead,
                                                      None, None, attn_bias)
        attn_output = self._postprocess_layer(attn_output, enc_input)
        pre_process2_output = self._preprocess_layer2(attn_output)
        ffd_output = self._positionwise_feed_forward(pre_process2_output)
        return self._postprocess_layer2(ffd_output, attn_output)


class EncoderLayer(Layer):
    """
    encoder
    """

    def __init__(self,
                 hidden_act,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
                 param_initializer=None,
                 name=""):
        super(EncoderLayer, self).__init__()
        self._preprocess_cmd = preprocess_cmd
        self._encoder_sublayers = list()
        self._prepostprocess_dropout = prepostprocess_dropout
        self._n_layer = n_layer
        self._hidden_act = hidden_act
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd, 3, self._prepostprocess_dropout,
            "post_encoder")

        for i in range(n_layer):
            self._encoder_sublayers.append(
                self.add_sublayer(
                    'esl_%d' % i,
                    EncoderSubLayer(
                        hidden_act,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        param_initializer,
                        name=name + '_layer_' + str(i))))

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input:
        :param attn_bias:
        :return:
        """
        outputs = []
        for i in range(self._n_layer):
            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
            outputs.append(enc_output)
            enc_input = enc_output
        return outputs
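
The process_cmd strings act as tiny programs: each character appends a functor, so "nd" means layer-norm then dropout, and the encoder's postprocess_cmd="dan" means dropout, residual add, then layer-norm. A standalone sketch instantiating the encoder with BERT-base head/width settings (layer count and input sizes reduced so it runs quickly; "relu" stands in for BERT's gelu):

    import numpy as np
    import paddle.fluid as fluid
    from paddleslim.teachers.bert.model.transformer_encoder import EncoderLayer

    with fluid.dygraph.guard():
        encoder = EncoderLayer(
            hidden_act="relu", n_layer=2, n_head=12, d_key=64, d_value=64,
            d_model=768, d_inner_hid=3072, prepostprocess_dropout=0.1,
            attention_dropout=0.1, relu_dropout=0,
            preprocess_cmd="", postprocess_cmd="dan")
        emb = fluid.dygraph.to_variable(
            np.random.rand(2, 16, 768).astype("float32"))
        # An all-zero bias means every position may attend to every other.
        bias = fluid.dygraph.to_variable(
            np.zeros((2, 12, 16, 16), dtype="float32"))
        outputs = encoder(emb, bias)
        print(len(outputs), outputs[0].shape)  # 2 [2, 16, 768]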
paddleslim/teachers/bert/optimization.py  0 → 100755
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay


class ConstantLR(LearningRateDecay):
    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
        super(ConstantLR, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate

    def step(self):
        return self.learning_rate


class LinearDecay(LearningRateDecay):
    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 decay_steps,
                 end_learning_rate=0.0001,
                 power=1.0,
                 cycle=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(LinearDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle

    def step(self):
        if self.step_num < self.warmup_steps:
            decayed_lr = self.learning_rate * (self.step_num /
                                               self.warmup_steps)
            decayed_lr = self.create_lr_var(decayed_lr)
        else:
            tmp_step_num = self.step_num
            tmp_decay_steps = self.decay_steps
            if self.cycle:
                div_res = fluid.layers.ceil(
                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
                if tmp_step_num == 0:
                    div_res = self.create_lr_var(1.0)
                tmp_decay_steps = self.decay_steps * div_res
            else:
                tmp_step_num = self.create_lr_var(
                    tmp_step_num
                    if tmp_step_num < self.decay_steps else self.decay_steps)
            decayed_lr = (self.learning_rate - self.end_learning_rate) * \
                ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + \
                self.end_learning_rate
        return decayed_lr


class Optimizer(object):
    def __init__(self,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 model_cls,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 loss_scaling=1.0,
                 parameter_list=None):
        self.warmup_steps = warmup_steps
        self.num_train_steps = num_train_steps
        self.learning_rate = learning_rate
        self.model_cls = model_cls
        self.weight_decay = weight_decay
        self.scheduler = scheduler
        self.loss_scaling = loss_scaling
        self.parameter_list = parameter_list

        self.scheduled_lr = 0.0
        self.optimizer = self.lr_schedule()

    def lr_schedule(self):
        if self.warmup_steps > 0:
            if self.scheduler == 'noam_decay':
                self.scheduled_lr = fluid.dygraph.NoamDecay(
                    1 / (self.warmup_steps * (self.learning_rate**2)),
                    self.warmup_steps)
            elif self.scheduler == 'linear_warmup_decay':
                self.scheduled_lr = LinearDecay(self.learning_rate,
                                                self.warmup_steps,
                                                self.num_train_steps, 0.0)
            else:
                raise ValueError("Unknown learning rate scheduler, should be "
                                 "'noam_decay' or 'linear_warmup_decay'")
            optimizer = fluid.optimizer.Adam(
                learning_rate=self.scheduled_lr,
                parameter_list=self.parameter_list)
        else:
            self.scheduled_lr = ConstantLR(self.learning_rate)
            optimizer = fluid.optimizer.Adam(
                learning_rate=self.scheduled_lr,
                parameter_list=self.parameter_list)
        return optimizer

    def exclude_from_weight_decay(self, name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    def optimization(self, loss, use_data_parallel=False, model=None):
        param_list = dict()

        clip_norm_thres = 1.0
        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)

        if use_data_parallel:
            loss = model.scale_loss(loss)

        loss.backward()

        if self.weight_decay > 0:
            for param in self.model_cls.parameters():
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        if use_data_parallel:
            assert model is not None
            model.apply_collective_grads()

        #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
        _, param_grads = self.optimizer.minimize(loss)

        if self.weight_decay > 0:
            for param, grad in param_grads:
                if self.exclude_from_weight_decay(param.name):
                    continue
                if isinstance(self.scheduled_lr.step(), float):
                    updated_param = param.numpy() - param_list[
                        param.name].numpy() * self.weight_decay * \
                        self.scheduled_lr.step()
                else:
                    updated_param = param.numpy() - param_list[
                        param.name].numpy() * self.weight_decay * \
                        self.scheduled_lr.step().numpy()
                updated_param_var = fluid.dygraph.to_variable(updated_param)
                param = updated_param_var
                #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
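
To make the linear_warmup_decay schedule concrete: the rate climbs linearly from 0 to learning_rate over warmup_steps, then decays linearly toward end_learning_rate over decay_steps. The same curve in plain Python (assuming power=1.0 and cycle=False):

    def linear_warmup_decay(step, lr, warmup_steps, decay_steps, end_lr=0.0):
        if step < warmup_steps:
            # Warmup: scale lr by the fraction of warmup completed.
            return lr * step / warmup_steps
        # Decay: interpolate linearly from lr down to end_lr.
        step = min(step, decay_steps)
        return (lr - end_lr) * (1 - step / decay_steps) + end_lr

    for s in (0, 50, 100, 550, 1000):
        print(s, linear_warmup_decay(s, 1e-4, warmup_steps=100,
                                     decay_steps=1000))
    # 0 0.0 | 50 5e-05 | 100 9e-05 | 550 4.5e-05 | 1000 0.0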
paddleslim/teachers/bert/reader/__init__.py  0 → 100644  (empty file)
paddleslim/teachers/bert/reader/cls.py  0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import types
import csv
import numpy as np

from .. import tokenization
from ..batching import prepare_batch_data


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def __init__(self,
                 data_dir,
                 vocab_path,
                 max_seq_len,
                 do_lower_case,
                 in_tokens,
                 random_seed=None):
        self.data_dir = data_dir
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.in_tokens = in_tokens

        np.random.seed(random_seed)

        self.current_train_example = -1
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        self.current_train_epoch = -1

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for prediction."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def convert_example(self, index, example, labels, max_seq_len, tokenizer):
        """Converts a single `InputExample` into a single `InputFeatures`."""
        feature = convert_single_example(index, example, labels, max_seq_len,
                                         tokenizer)
        return feature

    def generate_instance(self, feature):
        """
        generate instance with given feature

        Args:
            feature: InputFeatures(object). A single set of features of data.
        """
        input_pos = list(range(len(feature.input_ids)))
        return [
            feature.input_ids, feature.segment_ids, input_pos,
            feature.label_id
        ]

    def generate_batch_data(self,
                            batch_data,
                            total_token_num,
                            voc_size=-1,
                            mask_id=-1,
                            return_input_mask=True,
                            return_max_len=False,
                            return_num_token=False):
        return prepare_batch_data(
            batch_data,
            total_token_num,
            voc_size=-1,
            pad_id=self.vocab["[PAD]"],
            cls_id=self.vocab["[CLS]"],
            sep_id=self.vocab["[SEP]"],
            mask_id=-1,
            return_input_mask=True,
            return_max_len=False,
            return_num_token=False)

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with io.open(input_file, "r", encoding="utf8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

    def get_num_examples(self, phase):
        """Get number of examples for train, dev or test."""
        if phase not in ['train', 'dev', 'test']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        return self.num_examples[phase]

    def get_train_progress(self):
        """Gets progress for training phase."""
        return self.current_train_example, self.current_train_epoch

    def data_generator(self,
                       batch_size,
                       phase='train',
                       epoch=1,
                       dev_count=1,
                       shuffle=True,
                       shuffle_seed=None):
        """
        Generate data for train, dev or test.

        Args:
            batch_size: int. The batch size of generated data.
            phase: string. The phase for which to generate data.
            epoch: int. Total epochs to generate data.
            shuffle: bool. Whether to shuffle examples.
        """
        if phase == 'train':
            examples = self.get_train_examples(self.data_dir)
            self.num_examples['train'] = len(examples)
        elif phase == 'dev':
            examples = self.get_dev_examples(self.data_dir)
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            examples = self.get_test_examples(self.data_dir)
            self.num_examples['test'] = len(examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")

        def instance_reader():
            for epoch_index in range(epoch):
                if shuffle:
                    if shuffle_seed is not None:
                        np.random.seed(shuffle_seed)
                    np.random.shuffle(examples)
                if phase == 'train':
                    self.current_train_epoch = epoch_index
                for (index, example) in enumerate(examples):
                    if phase == 'train':
                        self.current_train_example = index + 1
                    feature = self.convert_example(
                        index, example, self.get_labels(), self.max_seq_len,
                        self.tokenizer)
                    instance = self.generate_instance(feature)
                    yield instance

        def batch_reader(reader, batch_size, in_tokens):
            batch, total_token_num, max_len = [], 0, 0
            for instance in reader():
                token_ids, sent_ids, pos_ids, label = instance[:4]
                max_len = max(max_len, len(token_ids))
                if in_tokens:
                    to_append = (len(batch) + 1) * max_len <= batch_size
                else:
                    to_append = len(batch) < batch_size
                if to_append:
                    batch.append(instance)
                    total_token_num += len(token_ids)
                else:
                    yield batch, total_token_num
                    batch, total_token_num, max_len = [instance], len(
                        token_ids), len(token_ids)

            if len(batch) > 0:
                yield batch, total_token_num

        def wrapper():
            all_dev_batches = []
            for batch_data, total_token_num in batch_reader(
                    instance_reader, batch_size, self.in_tokens):
                batch_data = self.generate_batch_data(
                    batch_data,
                    total_token_num,
                    voc_size=-1,
                    mask_id=-1,
                    return_input_mask=True,
                    return_max_len=False,
                    return_num_token=False)
                if len(all_dev_batches) < dev_count:
                    all_dev_batches.append(batch_data)
                if len(all_dev_batches) == dev_count:
                    for batch in all_dev_batches:
                        yield batch
                    all_dev_batches = []

        return wrapper


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class XnliProcessor(DataProcessor):
    """Processor for the XNLI data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        self.language = "zh"
        lines = self._read_tsv(
            os.path.join(data_dir, "multinli",
                         "multinli.train.%s.tsv" % self.language))
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "train-%d" % (i)
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[2])
            if label == tokenization.convert_to_unicode("contradictory"):
                label = tokenization.convert_to_unicode("contradiction")
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_dev_examples(self, data_dir):
        """See base class."""
        self.language = "zh"
        lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "dev-%d" % (i)
            language = tokenization.convert_to_unicode(line[0])
            if language != tokenization.convert_to_unicode(self.language):
                continue
            text_a = tokenization.convert_to_unicode(line[6])
            text_b = tokenization.convert_to_unicode(line[7])
            label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_test_examples(self, data_dir):
        """See base class."""
        self.language = "zh"
        lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "test-%d" % (i)
            language = tokenization.convert_to_unicode(line[0])
            if language != tokenization.convert_to_unicode(self.language):
                continue
            text_a = tokenization.convert_to_unicode(line[6])
            text_b = tokenization.convert_to_unicode(line[7])
            label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]


class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
            "dev_matched")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test_matched.tsv")),
            "test")

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type,
                              tokenization.convert_to_unicode(line[0]))
            text_a = tokenization.convert_to_unicode(line[8])
            text_b = tokenization.convert_to_unicode(line[9])
            if set_type == "test":
                label = "contradiction"
            else:
                label = tokenization.convert_to_unicode(line[-1])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[3])
            text_b = tokenization.convert_to_unicode(line[4])
            if set_type == "test":
                label = "0"
            else:
                label = tokenization.convert_to_unicode(line[0])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            # Only the test set has a header
            if set_type == "test" and i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            if set_type == "test":
                text_a = tokenization.convert_to_unicode(line[1])
                label = "0"
            else:
                text_a = tokenization.convert_to_unicode(line[3])
                label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


def convert_single_example_to_unicode(guid, single_example):
    text_a = tokenization.convert_to_unicode(single_example[0])
    text_b = tokenization.convert_to_unicode(single_example[1])
    label = tokenization.convert_to_unicode(single_example[2])
    return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)


def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    label_id = label_map[example.label]

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)

    return feature


def convert_examples_to_features
(
examples
,
label_list
,
max_seq_length
,
tokenizer
):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
if
ex_index
%
10000
==
0
:
print
(
"Writing example %d of %d"
%
(
ex_index
,
len
(
examples
)))
feature
=
convert_single_example
(
ex_index
,
example
,
label_list
,
max_seq_length
,
tokenizer
)
features
.
append
(
feature
)
return
features
if
__name__
==
'__main__'
:
pass
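For orientation, a minimal usage sketch of the feature pipeline above (not part of this commit; the vocab path and the example text are illustrative assumptions):

# Usage sketch: build one InputFeatures from a hand-made sentence pair.
# "./vocab.txt" is a hypothetical BERT vocab path.
import tokenization

tokenizer = tokenization.FullTokenizer(
    vocab_file="./vocab.txt", do_lower_case=True)
example = InputExample(
    guid="dev-1",
    text_a="He said the food was good.",
    text_b="The food was delicious.",
    label="1")
feature = convert_single_example(
    0, example, label_list=["0", "1"], max_seq_length=128, tokenizer=tokenizer)
# input_ids/segment_ids are unpadded here; padding happens at batching time.
print(feature.input_ids[:8], feature.segment_ids[:8], feature.label_id)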
paddleslim/teachers/bert/reader/pretraining.py
0 → 100644
Browse file @ e0df8292
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division

import os
import numpy as np
import types
import gzip
import logging
import re
import six
import collections
import tokenization
import paddle
import paddle.fluid as fluid

from batching import prepare_batch_data


class DataReader(object):
    def __init__(self,
                 data_dir,
                 vocab_path,
                 batch_size=4096,
                 in_tokens=True,
                 max_seq_len=512,
                 shuffle_files=True,
                 epoch=100,
                 voc_size=0,
                 is_test=False,
                 generate_neg_sample=False):

        self.vocab = self.load_vocab(vocab_path)
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.in_tokens = in_tokens
        self.shuffle_files = shuffle_files
        self.epoch = epoch
        self.current_epoch = 0
        self.current_file_index = 0
        self.total_file = 0
        self.current_file = None
        self.voc_size = voc_size
        self.max_seq_len = max_seq_len
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]
        self.is_test = is_test
        self.generate_neg_sample = generate_neg_sample
        if self.in_tokens:
            assert self.batch_size >= self.max_seq_len, "The number of " \
                "tokens in batch should not be smaller than max seq length."

        if self.is_test:
            self.epoch = 1
            self.shuffle_files = False

    def get_progress(self):
        """Return the current progress of the training data."""
        return self.current_epoch, self.current_file_index, self.total_file, \
            self.current_file

    def parse_line(self, line, max_seq_len=512):
        """Parse one line into token_ids, sentence_ids, pos_ids, label."""
        line = line.strip().decode().split(";")
        assert len(line) == 4, "One sample must have 4 fields!"
        (token_ids, sent_ids, pos_ids, label) = line
        token_ids = [int(token) for token in token_ids.split(" ")]
        sent_ids = [int(token) for token in sent_ids.split(" ")]
        pos_ids = [int(token) for token in pos_ids.split(" ")]
        assert len(token_ids) == len(sent_ids) == len(
            pos_ids
        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
        label = int(label)
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, label]

    def read_file(self, file):
        assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
        file_path = self.data_dir + "/" + file
        with gzip.open(file_path, "rb") as f:
            for line in f:
                parsed_line = self.parse_line(
                    line, max_seq_len=self.max_seq_len)
                if parsed_line is None:
                    continue
                yield parsed_line

    def convert_to_unicode(self, text):
        """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
        if six.PY3:
            if isinstance(text, str):
                return text
            elif isinstance(text, bytes):
                return text.decode("utf-8", "ignore")
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        elif six.PY2:
            if isinstance(text, str):
                return text.decode("utf-8", "ignore")
            elif isinstance(text, unicode):
                return text
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        else:
            raise ValueError("Not running on Python 2 or Python 3?")

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a dictionary."""
        vocab = collections.OrderedDict()
        fin = open(vocab_file)
        for num, line in enumerate(fin):
            items = self.convert_to_unicode(line.strip()).split("\t")
            if len(items) > 2:
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
        return vocab

    def random_pair_neg_samples(self, pos_samples):
        """Randomly generate negative samples from pos_samples.

        Args:
            pos_samples: list of positive samples
        Returns:
            neg_samples: list of negative samples
        """
        np.random.shuffle(pos_samples)
        num_sample = len(pos_samples)
        neg_samples = []
        miss_num = 0

        for i in range(num_sample):
            pair_index = (i + 1) % num_sample
            origin_src_ids = pos_samples[i][0]
            origin_sep_index = origin_src_ids.index(2)
            pair_src_ids = pos_samples[pair_index][0]
            pair_sep_index = pair_src_ids.index(2)

            src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
                pair_sep_index + 1:]
            if len(src_ids) >= self.max_seq_len:
                miss_num += 1
                continue
            sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
                1
            ] * len(pair_src_ids[pair_sep_index + 1:])
            pos_ids = list(range(len(src_ids)))
            neg_sample = [src_ids, sent_ids, pos_ids, 0]
            assert len(src_ids) == len(sent_ids) == len(
                pos_ids
            ), "[ERROR]len(src_id) == len(sent_id) == len(pos_id) must be True"
            neg_samples.append(neg_sample)
        return neg_samples, miss_num

    def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
        """ 1. Generate negative samples by randomly pairing sentence_1 and
               sentence_2 of positive samples.
            2. Combine negative samples and positive samples.

        Args:
            pos_sample_generator: a generator producing a parsed positive
                sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
        Returns:
            sample: one sample from shuffled positive samples and negative samples
        """
        pos_samples = []
        num_total_miss = 0
        pos_sample_num = 0
        try:
            while True:
                while len(pos_samples) < buffer:
                    pos_sample = next(pos_sample_generator)
                    label = pos_sample[3]
                    assert label == 1, "positive sample's label must be 1"
                    pos_samples.append(pos_sample)
                    pos_sample_num += 1

                neg_samples, miss_num = self.random_pair_neg_samples(
                    pos_samples)
                num_total_miss += miss_num
                samples = pos_samples + neg_samples
                pos_samples = []
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
        except StopIteration:
            print("stopiteration: reach end of file")
            if len(pos_samples) == 1:
                yield pos_samples[0]
            elif len(pos_samples) == 0:
                yield None
            else:
                neg_samples, miss_num = self.random_pair_neg_samples(
                    pos_samples)
                num_total_miss += miss_num
                samples = pos_samples + neg_samples
                pos_samples = []
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
        print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
              (num_total_miss, pos_sample_num * 2,
               num_total_miss / (pos_sample_num * 2)))

    def data_generator(self):
        """
        data_generator
        """
        files = os.listdir(self.data_dir)
        self.total_file = len(files)
        assert self.total_file > 0, "[Error] data_dir is empty"

        def wrapper():
            def reader():
                for epoch in range(self.epoch):
                    self.current_epoch = epoch + 1
                    if self.shuffle_files:
                        np.random.shuffle(files)
                    for index, file in enumerate(files):
                        self.current_file_index = index + 1
                        self.current_file = file
                        sample_generator = self.read_file(file)
                        if not self.is_test and self.generate_neg_sample:
                            sample_generator = self.mixin_negtive_samples(
                                sample_generator)
                        for sample in sample_generator:
                            if sample is None:
                                continue
                            yield sample

            def batch_reader(reader, batch_size, in_tokens):
                batch, total_token_num, max_len = [], 0, 0
                for parsed_line in reader():
                    token_ids, sent_ids, pos_ids, label = parsed_line
                    max_len = max(max_len, len(token_ids))
                    if in_tokens:
                        to_append = (len(batch) + 1) * max_len <= batch_size
                    else:
                        to_append = len(batch) < batch_size
                    if to_append:
                        batch.append(parsed_line)
                        total_token_num += len(token_ids)
                    else:
                        yield batch, total_token_num
                        batch, total_token_num, max_len = [parsed_line], len(
                            token_ids), len(token_ids)

                if len(batch) > 0:
                    yield batch, total_token_num

            for batch_data, total_token_num in batch_reader(
                    reader, self.batch_size, self.in_tokens):
                yield prepare_batch_data(
                    batch_data,
                    total_token_num,
                    voc_size=self.voc_size,
                    pad_id=self.pad_id,
                    cls_id=self.cls_id,
                    sep_id=self.sep_id,
                    mask_id=self.mask_id,
                    return_input_mask=True,
                    return_max_len=False,
                    return_num_token=False)

        return wrapper


if __name__ == "__main__":
    pass
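A minimal sketch of driving DataReader (not part of this commit). The data directory of ';'-delimited .gz sample files and the vocab path below are hypothetical assumptions:

# Usage sketch: iterate pre-training batches from a hypothetical corpus dir.
reader = DataReader(
    data_dir="./pretrain_data",   # assumed: contains *.gz files, one sample per line
    vocab_path="./vocab.txt",     # assumed: BERT vocab with [PAD]/[CLS]/[SEP]/[MASK]
    batch_size=4096,
    in_tokens=True,               # batch_size counts tokens, not samples
    max_seq_len=512,
    generate_neg_sample=True)     # mix in next-sentence negatives
train_gen = reader.data_generator()
for batch in train_gen():
    # each batch is the padded field list built by prepare_batch_data
    break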
paddleslim/teachers/bert/reader/squad.py
0 → 100644
Browse file @ e0df8292
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import tokenization
from batching import prepare_batch_data


class SquadExample(object):
    """A single training/test example for simple sequence classification.

    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.start_position:
            s += ", end_position: %d" % (self.end_position)
        if self.start_position:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible


def read_squad_examples(input_file, is_training,
                        version_2_with_negative=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(
                            end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            print("Could not find answer: '%s' vs. '%s'" %
                                  (actual_text, cleaned_answer_text))
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)

    return examples


def convert_examples_to_features(
        examples,
        tokenizer,
        max_seq_length,
        doc_stride,
        max_query_length,
        is_training,
        #output_fn
):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000

    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[
                    split_token_index]

                is_max_context = _check_is_max_context(
                    doc_spans, doc_span_index, split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            #while len(input_ids) < max_seq_length:
            #    input_ids.append(0)
            #    input_mask.append(0)
            #    segment_ids.append(0)

            #assert len(input_ids) == max_seq_length
            #assert len(input_mask) == max_seq_length
            #assert len(segment_ids) == max_seq_length

            start_position = None
            end_position = None
            if is_training and not example.is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start and
                        tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                else:
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and example.is_impossible:
                start_position = 0
                end_position = 0
            """
            if example_index < 3:
                print("*** Example ***")
                print("unique_id: %s" % (unique_id))
                print("example_index: %s" % (example_index))
                print("doc_span_index: %s" % (doc_span_index))
                print("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in tokens]))
                print("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_orig_map)
                ]))
                print("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                print("segment_ids: %s" %
                      " ".join([str(x) for x in segment_ids]))
                if is_training and example.is_impossible:
                    print("impossible example")
                if is_training and not example.is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position +
                                                                  1)])
                    print("start_position: %d" % (start_position))
                    print("end_position: %d" % (end_position))
                    print("answer: %s" %
                          (tokenization.printable_text(answer_text)))
            """
            feature = InputFeatures(
                unique_id=unique_id,
                example_index=example_index,
                doc_span_index=doc_span_index,
                tokens=tokens,
                token_to_orig_map=token_to_orig_map,
                token_is_max_context=token_is_max_context,
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                start_position=start_position,
                end_position=end_position,
                is_impossible=example.is_impossible)

            unique_id += 1

            yield feature


def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context,
                    num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


class DataProcessor(object):
    def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
                 doc_stride, max_query_length):
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self._max_seq_length = max_seq_length
        self._doc_stride = doc_stride
        self._max_query_length = max_query_length
        self._in_tokens = in_tokens

        self.vocab = self._tokenizer.vocab
        self.vocab_size = len(self.vocab)
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]

        self.current_train_example = -1
        self.num_train_examples = -1
        self.current_train_epoch = -1

        self.train_examples = None
        self.predict_examples = None
        self.num_examples = {'train': -1, 'predict': -1}

    def get_train_progress(self):
        """Gets progress for training phase."""
        return self.current_train_example, self.current_train_epoch

    def get_examples(self,
                     data_path,
                     is_training,
                     version_2_with_negative=False):
        examples = read_squad_examples(
            input_file=data_path,
            is_training=is_training,
            version_2_with_negative=version_2_with_negative)
        return examples

    def get_num_examples(self, phase):
        if phase not in ['train', 'predict']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'predict'].")
        return self.num_examples[phase]

    def get_features(self, examples, is_training):
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=self._tokenizer,
            max_seq_length=self._max_seq_length,
            doc_stride=self._doc_stride,
            max_query_length=self._max_query_length,
            is_training=is_training)
        return features

    def data_generator(self,
                       data_path,
                       batch_size,
                       phase='train',
                       shuffle=False,
                       dev_count=1,
                       version_2_with_negative=False,
                       epoch=1):
        if phase == 'train':
            self.train_examples = self.get_examples(
                data_path,
                is_training=True,
                version_2_with_negative=version_2_with_negative)
            examples = self.train_examples
            self.num_examples['train'] = len(self.train_examples)
        elif phase == 'predict':
            self.predict_examples = self.get_examples(
                data_path,
                is_training=False,
                version_2_with_negative=version_2_with_negative)
            examples = self.predict_examples
            self.num_examples['predict'] = len(self.predict_examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'predict'].")

        def batch_reader(features, batch_size, in_tokens):
            batch, total_token_num, max_len = [], 0, 0
            for (index, feature) in enumerate(features):
                if phase == 'train':
                    self.current_train_example = index + 1
                seq_len = len(feature.input_ids)
                labels = [feature.unique_id] if feature.start_position is None \
                    else [feature.start_position, feature.end_position]
                example = [
                    feature.input_ids, feature.segment_ids, range(seq_len)
                ] + labels
                max_len = max(max_len, seq_len)
                #max_len = max(max_len, len(token_ids))
                if in_tokens:
                    to_append = (len(batch) + 1) * max_len <= batch_size
                else:
                    to_append = len(batch) < batch_size
                if to_append:
                    batch.append(example)
                    total_token_num += seq_len
                else:
                    yield batch, total_token_num
                    batch, total_token_num, max_len = [example
                                                       ], seq_len, seq_len

            if len(batch) > 0:
                yield batch, total_token_num

        def wrapper():
            for epoch_index in range(epoch):
                if shuffle:
                    random.shuffle(examples)
                if phase == 'train':
                    self.current_train_epoch = epoch_index
                    features = self.get_features(examples, is_training=True)
                else:
                    features = self.get_features(examples, is_training=False)

                all_dev_batches = []
                for batch_data, total_token_num in batch_reader(
                        features, batch_size, self._in_tokens):
                    batch_data = prepare_batch_data(
                        batch_data,
                        total_token_num,
                        voc_size=-1,
                        pad_id=self.pad_id,
                        cls_id=self.cls_id,
                        sep_id=self.sep_id,
                        mask_id=-1,
                        return_input_mask=True,
                        return_max_len=False,
                        return_num_token=False)
                    if len(all_dev_batches) < dev_count:
                        all_dev_batches.append(batch_data)

                    if len(all_dev_batches) == dev_count:
                        for batch in all_dev_batches:
                            yield batch
                        all_dev_batches = []

        return wrapper


def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case,
                      output_prediction_file, output_nbest_file,
                      output_null_log_odds_file, version_2_with_negative,
                      null_score_diff_threshold, verbose):
    """Write final predictions to the json file and log-odds of null if needed."""
    print("Writing predictions to: %s" % (output_prediction_file))
    print("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits,
                                              n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[0] + \
                    result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index,
                                                            False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(
                    pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(
                    orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text,
                                            do_lower_case, verbose)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(
                    text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        # debug
        if best_non_null_entry is None:
            print("Warning: no non-null prediction found for this example")

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")


def get_final_text(pred_text, orig_text, do_lower_case, verbose):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose:
            print("Unable to find text: '%s' in '%s'" % (pred_text,
                                                         orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose:
            print("Length not equal after stripping spaces: '%s' vs '%s'" %
                  (orig_ns_text, tok_ns_text))
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose:
            print("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose:
            print("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text


def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(
        enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes


def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []

    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs


if __name__ == '__main__':
    train_file = 'squad/train-v1.1.json'
    vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
    do_lower_case = True
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    train_examples = read_squad_examples(
        input_file=train_file, is_training=True)
    print("begin converting")
    for (index, feature) in enumerate(
            convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=True,
                #output_fn=train_writer.process_feature
            )):
        if index < 10:
            print(index, feature.input_ids, feature.input_mask,
                  feature.segment_ids)

    #for (index, example) in enumerate(train_examples):
    #    if index < 5:
    #        print(example)
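The "max context" rule in _check_is_max_context is easiest to verify on the example from its own comment; here is a small sketch (not part of this commit) replaying spans B and C, where "bought" is token 7 of the document:

# Worked check of the max-context rule on the comment's example.
# Span B covers tokens 3..7 ("to the store and bought");
# span C covers tokens 6..10 ("and bought a gallon of").
import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans = [_DocSpan(start=3, length=5), _DocSpan(start=6, length=5)]
# For token 7, span B gives min(4 left, 0 right) = 0 and span C gives
# min(1 left, 3 right) = 1, so span C is the max-context span.
assert not _check_is_max_context(doc_spans, cur_span_index=0, position=7)
assert _check_is_max_context(doc_spans, cur_span_index=1, position=7)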
paddleslim/teachers/bert/tokenization.py
0 → 100644
Browse file @ e0df8292
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import six
import io


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""

    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    fin = io.open(vocab_file, encoding="utf8")
    for num, line in enumerate(fin):
        items = convert_to_unicode(line.strip()).split("\t")
        if len(items) > 2:
            break
        token = items[0]
        index = items[1] if len(items) == 2 else num
        token = token.strip()
        vocab[token] = int(index)
    return vocab


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output


def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class CharTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in text.lower().split(" "):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
            (cp >= 0x3400 and cp <= 0x4DBF) or  #
            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or  #
            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]",
                 max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyway, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
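A small sketch (not part of this commit) of the greedy longest-match-first WordPiece algorithm described in the docstring above, run on a toy in-memory vocab rather than a real vocab file:

# Usage sketch: WordPiece on a toy vocab.
toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wp.tokenize("xyz"))        # ['[UNK]'] - no vocab piece matches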
paddleslim/teachers/bert/utils/__init__.py
0 → 100644
Browse file @ e0df8292
paddleslim/teachers/bert/utils/convert_static_to_dygraph.py
0 → 100755
Browse file @ e0df8292
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os


def usage():
    """
    usage information
    """
    print()
    print("please use command: ")
    print("python convert_static_to_dygraph.py input_params_dir output_params_dir")
    print()


def convert_static_to_dygraph(static_model_path, dygraph_model_path):
    """
    convert paddle static bert model to dygraph model
    """

    def mkdir(path):
        # Recursively create `path`, creating any missing parents first.
        if not os.path.isdir(path):
            parent = os.path.split(path)[0]
            if parent and not os.path.isdir(parent):
                mkdir(parent)
            os.mkdir(path)

    if os.path.exists(dygraph_model_path):
        shutil.rmtree(dygraph_model_path)
    mkdir(dygraph_model_path)

    if not os.path.exists(static_model_path):
        print("paddle static model path doesn't exist.....")
        return -1

    file_list = []
    for root, dirs, files in os.walk(static_model_path):
        file_list.extend(files)

    # Lay out the directory tree expected by the dygraph layers.
    os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
    os.makedirs(
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/BertModelLayer_0"))
    os.makedirs(
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/PrePostProcessLayer_0"))
    os.makedirs(
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))

    #os.chdir(static_model_path)
    # convert embedding files
    embedding_type = ["word", "pos", "sent"]
    for i in range(3):
        src_name = embedding_type[i] + "_embedding"
        trg_name = "Embedding_" + str(i) + "." + src_name
        shutil.copyfile(
            os.path.join(static_model_path, src_name),
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" + trg_name))

    # convert pre_encoder files
    shutil.copyfile(
        os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
        ))
    shutil.copyfile(
        os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
        ))

    # convert mask lm params files
    shutil.copyfile(
        os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
        ))
    shutil.copyfile(
        os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
        ))
    shutil.copyfile(
        os.path.join(static_model_path, "next_sent_fc.b_0"),
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "next_sent_fc.w_0"),
        os.path.join(dygraph_model_path,
                     "PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "pooled_fc.b_0"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
    shutil.copyfile(
        os.path.join(static_model_path, "pooled_fc.w_0"),
        os.path.join(
            dygraph_model_path,
            "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))

    # count encoder layers from the file names: encoder_layer_<n>_<suffix>
    encoder_num = 0
    for f in file_list:
        if not f.startswith("encoder_layer"):
            continue
        layer_num = f.split('_')[2]
        if int(layer_num) > encoder_num:
            encoder_num = int(layer_num)
    encoder_num += 1

    for i in range(encoder_num):
        encoder_dir = "EncoderSubLayer_" + str(i)
        os.makedirs(
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" +
                         "EncoderLayer_0/", encoder_dir))
        os.makedirs(
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" +
                         "EncoderLayer_0/",
                         encoder_dir + "/PositionwiseFeedForwardLayer_0"))
        os.makedirs(
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" +
                         "EncoderLayer_0/",
                         encoder_dir + "/MultiHeadAttentionLayer_0"))
        os.makedirs(
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" +
                         "EncoderLayer_0/",
                         encoder_dir + "/PrePostProcessLayer_1"))
        os.makedirs(
            os.path.join(dygraph_model_path,
                         "PretrainModelLayer_0/BertModelLayer_0/" +
                         "EncoderLayer_0/",
                         encoder_dir + "/PrePostProcessLayer_3"))

    # map each static param file suffix to (dygraph sub dir, new file name)
    encoder_map_dict = {
        "ffn_fc_0.b_0": ("PositionwiseFeedForwardLayer_0",
                         "FC_0.ffn_fc_0.b_0"),
        "ffn_fc_0.w_0": ("PositionwiseFeedForwardLayer_0",
                         "FC_0.ffn_fc_0.w_0"),
        "ffn_fc_1.b_0": ("PositionwiseFeedForwardLayer_0",
                         "FC_1.ffn_fc_1.b_0"),
        "ffn_fc_1.w_0": ("PositionwiseFeedForwardLayer_0",
                         "FC_1.ffn_fc_1.w_0"),
        "multi_head_att_key_fc.b_0": ("MultiHeadAttentionLayer_0",
                                      "FC_1.key_fc.b_0"),
        "multi_head_att_key_fc.w_0": ("MultiHeadAttentionLayer_0",
                                      "FC_1.key_fc.w_0"),
        "multi_head_att_output_fc.b_0": ("MultiHeadAttentionLayer_0",
                                         "FC_3.output_fc.b_0"),
        "multi_head_att_output_fc.w_0": ("MultiHeadAttentionLayer_0",
                                         "FC_3.output_fc.w_0"),
        "multi_head_att_query_fc.b_0": ("MultiHeadAttentionLayer_0",
                                        "FC_0.query_fc.b_0"),
        "multi_head_att_query_fc.w_0": ("MultiHeadAttentionLayer_0",
                                        "FC_0.query_fc.w_0"),
        "multi_head_att_value_fc.b_0": ("MultiHeadAttentionLayer_0",
                                        "FC_2.value_fc.b_0"),
        "multi_head_att_value_fc.w_0": ("MultiHeadAttentionLayer_0",
                                        "FC_2.value_fc.w_0"),
        "post_att_layer_norm_bias": ("PrePostProcessLayer_1",
                                     "LayerNorm_0.post_att_layer_norm_bias"),
        "post_att_layer_norm_scale": ("PrePostProcessLayer_1",
                                      "LayerNorm_0.post_att_layer_norm_scale"),
        "post_ffn_layer_norm_bias": ("PrePostProcessLayer_3",
                                     "LayerNorm_0.post_ffn_layer_norm_bias"),
        "post_ffn_layer_norm_scale": ("PrePostProcessLayer_3",
                                      "LayerNorm_0.post_ffn_layer_norm_scale")
    }

    # copy every encoder param file into its dygraph location, renamed
    for f in file_list:
        if not f.startswith("encoder_layer"):
            continue
        layer_num = f.split('_')[2]
        suffix_name = "_".join(f.split('_')[3:])
        in_dir = encoder_map_dict[suffix_name][0]
        rename = encoder_map_dict[suffix_name][1]
        encoder_layer = "EncoderSubLayer_" + layer_num
        shutil.copyfile(
            os.path.join(static_model_path, f),
            os.path.join(
                dygraph_model_path,
                "PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
                encoder_layer + "/" + in_dir + "/" + rename))


if __name__ == "__main__":
    if len(sys.argv) < 3:
        usage()
        exit(1)
    static_model_path = sys.argv[1]
    dygraph_model_path = sys.argv[2]
    convert_static_to_dygraph(static_model_path, dygraph_model_path)
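A minimal usage sketch for the converter above. The checkpoint paths are hypothetical placeholders, not files shipped with the repo, and the import path assumes the module location shown in this diff:

# Hypothetical paths for illustration; the script can equally be run from
# the shell as:
#   python convert_static_to_dygraph.py input_params_dir output_params_dir
from paddleslim.teachers.bert.utils.convert_static_to_dygraph import (
    convert_static_to_dygraph)

convert_static_to_dygraph("./bert_static_params", "./bert_dygraph_params")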
paddleslim/teachers/bert/utils/fp16.py
0 → 100644
View file @ e0df8292
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid


def cast_fp16_to_fp32(i, o, prog):
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP16,
            "out_dtype": fluid.core.VarDesc.VarType.FP32
        })


def cast_fp32_to_fp16(i, o, prog):
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP32,
            "out_dtype": fluid.core.VarDesc.VarType.FP16
        })


def copy_to_master_param(p, block):
    v = block.vars.get(p.name, None)
    if v is None:
        raise ValueError("no param name %s found!" % p.name)
    new_p = fluid.framework.Parameter(
        block=block,
        shape=v.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=v.type,
        lod_level=v.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=v.name + ".master")
    return new_p


def create_master_params_grads(params_grads, main_prog, startup_prog,
                               loss_scaling):
    master_params_grads = []
    tmp_role = main_prog._current_role
    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
    main_prog._current_role = OpRole.Backward
    for p, g in params_grads:
        # create master parameters
        master_param = copy_to_master_param(p, main_prog.global_block())
        startup_master_param = startup_prog.global_block()._clone_variable(
            master_param)
        startup_p = startup_prog.global_block().var(p.name)
        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
        # cast fp16 gradients to fp32 before apply gradients
        if g.name.find("layer_norm") > -1:
            if loss_scaling > 1:
                scaled_g = g / float(loss_scaling)
            else:
                scaled_g = g
            master_params_grads.append([p, scaled_g])
            continue
        master_grad = fluid.layers.cast(g, "float32")
        if loss_scaling > 1:
            master_grad = master_grad / float(loss_scaling)
        master_params_grads.append([master_param, master_grad])
    main_prog._current_role = tmp_role
    return master_params_grads


def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    for idx, m_p_g in enumerate(master_params_grads):
        train_p, _ = params_grads[idx]
        if train_p.name.find("layer_norm") > -1:
            continue
        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
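For context, a sketch of how these helpers slot into a static-graph fp16 training setup, following the usual pattern of Paddle's BERT training scripts. It is illustrative only: `loss`, `train_program`, and `startup_prog` are assumed to come from an existing fp16 training script, and the loss_scaling value is arbitrary:

import paddle.fluid as fluid
from paddleslim.teachers.bert.utils.fp16 import (
    create_master_params_grads, master_param_to_train_param)

loss_scaling = 8.0  # illustrative; offsets fp16's narrow dynamic range

# Scale the loss before backward so small fp16 gradients don't underflow.
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
params_grads = optimizer.backward(loss * loss_scaling)

# Build fp32 master copies of the fp16 params plus unscaled fp32 grads,
# then apply the optimizer update in fp32.
master_params_grads = create_master_params_grads(params_grads, train_program,
                                                 startup_prog, loss_scaling)
optimizer.apply_gradients(master_params_grads)

# Cast the updated fp32 master weights back onto the fp16 training params.
master_param_to_train_param(master_params_grads, params_grads, train_program)

The layer_norm special-casing above reflects the common practice of keeping layer-norm parameters in fp32 throughout, since they are cheap to store but sensitive to quantization.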