PaddlePaddle / PaddleSlim
Commit 5ed02755
Authored April 17, 2020 by wanghaoshuang
Normalize the efficiency loss and KD loss.

Parent: 51044022
Showing 6 changed files with 89 additions and 38 deletions
demo/bert/search_bert.py  (+13 -6)
paddleslim/nas/darts/search_space/conv_bert/cls.py  (+32 -11)
paddleslim/nas/darts/search_space/conv_bert/model/bert.py  (+6 -0)
paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py  (+23 -10)
paddleslim/nas/darts/train_search.py  (+6 -5)
paddleslim/teachers/bert/cls.py  (+9 -6)
demo/bert/search_bert.py
@@ -14,7 +14,7 @@ def main():
     max_seq_len = 512
     do_lower_case = True
     batch_size = 32
-    epoch = 3
+    epoch = 30
     processor = MnliProcessor(
         data_dir=data_dir,
@@ -23,8 +23,6 @@ def main():
         do_lower_case=do_lower_case,
         in_tokens=False)
-    valid_reader = processor.data_generator(
-        batch_size=batch_size, phase='dev', epoch=epoch, shuffle=False)
     train_reader = processor.data_generator(
         batch_size=batch_size,
         phase='train',
@@ -32,13 +30,22 @@ def main():
         dev_count=1,
         shuffle=True)
+    val_reader = processor.data_generator(
+        batch_size=batch_size,
+        phase='train',
+        epoch=epoch,
+        dev_count=1,
+        shuffle=True)
     with fluid.dygraph.guard(place):
-        model = AdaBERTClassifier(3)
+        model = AdaBERTClassifier(
+            3, teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000")
         searcher = DARTSearch(
             model,
             train_reader,
-            valid_reader,
-            learning_rate=0.001,
+            val_reader,
+            batchsize=batch_size,
+            num_epochs=epoch,
+            log_freq=10)
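The demo now searches for 30 epochs, draws both batch streams from the training split (DARTS needs two independent streams: one to update the model weights, one to update the architecture parameters), and hands the classifier a trained teacher checkpoint. A minimal sketch of the resulting driver, assuming the readers, batch_size, and epoch are built as in the hunks above and that DARTSearch exposes a train() entry point:

    import paddle.fluid as fluid

    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # teacher_model is the author's locally trained BERT checkpoint
        model = AdaBERTClassifier(
            3, teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000")
        searcher = DARTSearch(
            model,
            train_reader,   # batches that update the model weights
            val_reader,     # batches that update the architecture parameters
            batchsize=batch_size,
            num_epochs=epoch,
            log_freq=10)
        searcher.train()    # assumed entry point; see train_search.py below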
paddleslim/nas/darts/search_space/conv_bert/cls.py
@@ -41,12 +41,20 @@ __all__ = ["AdaBERTClassifier"]
 class AdaBERTClassifier(Layer):
-    def __init__(self, num_labels, n_layer=12, emb_size=768):
+    def __init__(self, num_labels, n_layer=8, emb_size=768, teacher_model=None):
         super(AdaBERTClassifier, self).__init__()
         self._n_layer = n_layer
         self._num_labels = num_labels
         self._emb_size = emb_size
-        self.teacher = BERTClassifier(num_labels)
+        print(
+            "----------------------load teacher model and test----------------------------------------"
+        )
+        self.teacher = BERTClassifier(num_labels, model_path=teacher_model)
+        # self.teacher.test("/work/PaddleSlim/demo/bert/data/glue_data/MNLI/")
+        print(
+            "----------------------finish load teacher model and test----------------------------------------"
+        )
         self.student = BertModelLayer(
             n_layer=self._n_layer, emb_size=self._emb_size)
@@ -76,7 +84,7 @@ class AdaBERTClassifier(Layer):
     def genotype(self):
         return self.arch_parameters()

-    def loss(self, data_ids, beta=0.5, gamma=0.5):
+    def loss(self, data_ids, beta=4, gamma=0.8):
         T = 1.0
         src_ids = data_ids[0]
         position_ids = data_ids[1]
@@ -98,10 +106,20 @@ class AdaBERTClassifier(Layer):
         # define kd loss
         kd_losses = []
+        kd_weights = []
+        for i in range(len(next_sent_feats)):
+            j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
+            kd_weights.append(t_losses[j].numpy())
+
+        kd_weights = 1 / np.array(kd_weights)
+        kd_weights = np.exp(kd_weights - np.max(kd_weights))
+        kd_weights = kd_weights / kd_weights.sum(axis=0)
+
         for i in range(len(next_sent_feats)):
-            j = np.ceil(i * (len(next_sent_feats) / len(logits)))
+            j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
             t_logit = t_logits[j]
             t_loss = t_losses[j]
             s_sent_feat = next_sent_feats[i]
             fc = self.cls_fc[i]
             s_sent_feat = fluid.layers.dropout(
@@ -115,22 +133,25 @@ class AdaBERTClassifier(Layer):
             t_probs.stop_gradient = False
             kd_loss = t_probs * fluid.layers.log(s_probs / T)
             kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
             kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)
-            kd_loss = kd_loss / t_loss
+            kd_loss = kd_loss * kd_weights[i]
             kd_losses.append(kd_loss)
         kd_loss = fluid.layers.sum(kd_losses)
-        kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)

         # define ce loss
         ce_loss = fluid.layers.cross_entropy(s_probs, labels)
-        ce_loss = fluid.layers.mean(x=ce_loss) * k_i
+        ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i

         # define e loss
-        model_size = fluid.layers.sum(model_size)
-        flops = fluid.layers.sum(flops)
+        model_size = fluid.layers.sum(
+            model_size) / self.student.max_model_size()
+        flops = fluid.layers.sum(flops) / self.student.max_flops()
         e_loss = (len(next_sent_feats) * k_i / self._n_layer) * (flops + model_size)
         # define total loss
         loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
+        print("ce_loss: {}; kd_loss: {}; e_loss: {}".format(
+            (1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(),
+            beta * e_loss.numpy()))
         return loss
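The core of the change: instead of dividing each layer's KD term by its teacher loss (unstable when that loss is near zero), the teacher losses are turned into fixed softmax weights over their inverses, so the teacher layers that fit the data best contribute the most distillation signal. Note also that kd_loss here is the sum of t_probs * log(s_probs / T), a negative cross-entropy, so the minus sign in (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss still pushes the student toward the teacher. A standalone numpy sketch of the weighting, with made-up per-layer teacher losses:

    import numpy as np

    t_losses = np.array([2.0, 1.2, 0.7, 0.4])  # hypothetical per-layer teacher losses
    w = 1.0 / t_losses                         # invert: well-fitting layers score high
    w = np.exp(w - np.max(w))                  # numerically stable softmax, as in the diff
    kd_weights = w / w.sum(axis=0)
    print(kd_weights)                          # ~[0.08, 0.11, 0.21, 0.60]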
paddleslim/nas/darts/search_space/conv_bert/model/bert.py
@@ -82,6 +82,12 @@ class BertModelLayer(Layer):
         self._encoder = EncoderLayer(
             n_layer=self._n_layer, d_model=self._emb_size)

+    def max_flops(self):
+        return self._encoder.max_flops
+
+    def max_model_size(self):
+        return self._encoder.max_model_size
+
     def arch_parameters(self):
         return [self._encoder.alphas]
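These accessors surface the upper bounds the encoder accumulates (see transformer_encoder.py below); loss() divides the summed FLOPs and model size by them, so both resource terms enter e_loss as fractions of the largest reachable architecture. A toy sketch of why that normalization matters, with hypothetical magnitudes:

    # Raw FLOPs and parameter counts live on very different scales, so
    # dividing by the per-model maxima makes the two halves of e_loss
    # comparable before beta weighs them (numbers are illustrative).
    flops, max_flops = 1.8e9, 3.6e9                # operations
    size, max_size = 22e6, 44e6                    # parameters
    e_term = flops / max_flops + size / max_size   # 0.5 + 0.5 = 1.0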
paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
@@ -64,8 +64,8 @@ OPS = {
     'dil_conv_3': lambda: ConvBN(1, 1, filter_size=3, dilation=2),
     'dil_conv_5': lambda: ConvBN(1, 1, filter_size=5, dilation=2),
     'dil_conv_7': lambda: ConvBN(1, 1, filter_size=7, dilation=2),
-    'avg_pool_3': lambda: Pool2D(pool_size=(3, 1), pool_type='avg'),
-    'max_pool_3': lambda: Pool2D(pool_size=(3, 1), pool_type='max'),
+    'avg_pool_3': lambda: Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='avg'),
+    'max_pool_3': lambda: Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='max'),
     'none': lambda: Zero(),
     'skip_connect': lambda: Identity(),
 }
@@ -76,10 +76,13 @@ class MixedOp(fluid.dygraph.Layer):
         super(MixedOp, self).__init__()
         ops = [OPS[primitive]() for primitive in PRIMITIVES]
         self._ops = fluid.dygraph.LayerList(ops)
+        self.max_flops = max([FLOPs[primitive] for primitive in PRIMITIVES])
+        self.max_model_size = max(
+            [ModelSize[primitive] for primitive in PRIMITIVES])

     def forward(self, x, weights, flops=[], model_size=[]):
         for i in range(len(self._ops)):
-            if weights[i] != 0:
+            if weights[i].numpy() != 0:
                 flops.append(FLOPs.values()[i] * weights[i])
                 model_size.append(ModelSize.values()[i] * weights[i])
                 return self._ops[i](x) * weights[i]
@@ -135,8 +138,8 @@ class ConvBN(fluid.dygraph.Layer):
         self.conv_layer = Conv2D(
             in_ch,
             out_ch, [filter_size, 1],
-            dilation=dilation,
-            padding=[(filter_size - 1) // 2, 0],
+            dilation=[dilation, 1],
+            padding=[(filter_size - 1) * dilation // 2, 0],
             param_attr=conv_param,
             bias_attr=False,
             act=None,
@@ -154,10 +157,14 @@ class Cell(fluid.dygraph.Layer):
         super(Cell, self).__init__()
         self._steps = steps
+        self.max_flops = 0
+        self.max_model_size = 0

         ops = []
         for i in range(self._steps):
             for j in range(2 + i):
                 op = MixedOp()
+                self.max_flops += op.max_flops
+                self.max_model_size += op.max_model_size
                 ops.append(op)
         self._ops = fluid.dygraph.LayerList(ops)
@@ -191,10 +198,16 @@ class EncoderLayer(Layer):
         self._n_layer = n_layer
         self._d_model = d_model
         self._steps = 3
+        self.max_flops = 0
+        self.max_model_size = 0

         cells = []
         for i in range(n_layer):
-            cells.append(Cell(steps=self._steps))
+            cell = Cell(steps=self._steps)
+            cells.append(cell)
+            self.max_flops += cell.max_flops
+            self.max_model_size += cell.max_model_size
+
         self._cells = fluid.dygraph.LayerList(cells)

         k = sum(1 for i in range(self._steps) for n in range(2 + i))
@@ -222,7 +235,7 @@
             [-1, 1, enc_input.shape[1], self._d_model])

         alphas = gumbel_softmax(self.alphas)
-        k = gumbel_softmax(self.k)
+        k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])

         outputs = []
         s0 = s1 = tmp
@@ -235,7 +248,7 @@
                 enc_output = fluid.layers.reshape(
                     s1, [-1, enc_input.shape[1], self._d_model])
                 outputs.append(enc_output)
-                if k[i] != 0:
+                if k[i].numpy() != 0:
                     outputs[-1] = outputs[-1] * k[i]
-                    break
-        return outputs, k[i]
+                    return outputs, k[i]
+        return None
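Both the operation weights (alphas) and the exit gate (k) pass through gumbel_softmax, so the choice is near one-hot yet differentiable, and the new .numpy() != 0 guards skip branches whose sampled weight is exactly zero. A standalone numpy sketch of the Gumbel-softmax relaxation (the repo's own gumbel_softmax helper is assumed to behave like this):

    import numpy as np

    def gumbel_softmax_np(logits, tau=1.0):
        # g = -log(-log(U)), U ~ Uniform(0, 1): standard Gumbel(0, 1) noise
        g = -np.log(-np.log(np.random.uniform(1e-10, 1.0, logits.shape)))
        y = (logits + g) / tau
        e = np.exp(y - np.max(y))          # numerically stable softmax
        return e / e.sum()

    # Low temperature makes the sample nearly one-hot, so at most one
    # candidate op (or one exit position) dominates each forward pass:
    print(gumbel_softmax_np(np.array([1.0, 2.0, 3.0]), tau=0.1))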
paddleslim/nas/darts/train_search.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 __all__ = ['DARTSearch']

 import logging
+from itertools import izip
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
@@ -75,10 +76,8 @@ class DARTSearch(object):
         objs = AvgrageMeter()
         self.model.train()

-        for step_id, (train_data,
-                      valid_data) in enumerate(
-                          zip(train_loader(),
-                              valid_loader())):
+        step_id = 0
+        for train_data, valid_data in izip(train_loader(), valid_loader()):
             if epoch >= self.epochs_no_archopt:
                 architect.step(train_data, valid_data)
@@ -95,11 +94,13 @@ class DARTSearch(object):
             optimizer.minimize(loss, grad_clip)
             self.model.clear_gradients()

-            objs.update(loss.numpy(), self.batchsize)
+            batch_size = train_data[0].shape[0]
+            objs.update(loss.numpy(), batch_size)

             if step_id % self.log_freq == 0:
                 logger.info("Train Epoch {}, Step {}, loss {:.6f}".format(
                     epoch, step_id, objs.avg[0]))
+            step_id += 1
         return objs.avg[0]

     def valid_one_epoch(self, valid_loader, epoch):
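Two notes on this hunk: itertools.izip exists only under Python 2 (Python 3's zip is already lazy, so there izip would be a plain rename), and the meter is now fed the actual batch size taken from the data rather than the configured self.batchsize, which keeps the running average honest when the last batch is short. A toy sketch of the weighted mean involved:

    # AvgrageMeter-style weighted running mean: avg = sum(l_i * n_i) / sum(n_i).
    # With true batch sizes (say 32, 32, 7) a short final batch is no longer
    # over-counted the way a fixed batch size of 32 would count it.
    losses, sizes = [0.50, 0.40, 0.90], [32, 32, 7]
    avg = sum(l * n for l, n in zip(losses, sizes)) / float(sum(sizes))
    print(avg)  # ~0.494, vs 0.60 if every batch were weighted as 32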
paddleslim/teachers/bert/cls.py
@@ -86,17 +86,20 @@ class BERTClassifier(Layer):
         self.cls_model = ClsModelLayer(
             self.bert_config, num_labels, return_pooled_out=True)

-        if self.init_pretraining_params:
+        if model_path is not None:
+            #restore the model
+            print("Load params from %s" % model_path)
+            model_dict, _ = fluid.load_dygraph(model_path)
+            self.cls_model.load_dict(model_dict)
+        elif self.init_pretraining_params:
             print("Load pre-trained model from %s" %
                   self.init_pretraining_params)
             init_from_static_model(self.init_pretraining_params,
                                    self.cls_model, self.bert_config)
+        else:
+            raise Exception(
+                "You should load pretrained model for training this teacher model.")

     def forward(self, input):
         return self.cls_model(input)
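The teacher can now resume from a dygraph checkpoint (model_path) instead of only from static-graph pretraining parameters, and construction fails loudly when neither source is given. A minimal, runnable sketch of that loading path in Paddle 1.x dygraph, with a stand-in layer and an illustrative checkpoint prefix:

    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Linear

    with fluid.dygraph.guard():
        net = Linear(4, 2)  # stand-in for the real ClsModelLayer
        fluid.save_dygraph(net.state_dict(), "demo_ckpt")  # write something to load
        # load_dygraph returns (param_dict, optimizer_dict); the optimizer
        # state is discarded here, exactly as the diff does with "_".
        param_dict, _ = fluid.load_dygraph("demo_ckpt")
        net.load_dict(param_dict)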