PaddlePaddle / PaddleSlim
Commit 70c6b708
Authored May 19, 2020 by wanghaoshuang
Parent commit: d3e51bfc

Update bert based DARTS demo
Showing 8 changed files with 198 additions and 100 deletions (+198 -100):

paddleslim/common/meter.py                                                  +5   -1
paddleslim/nas/darts/architect.py                                           +3   -0
paddleslim/nas/darts/search_space/conv_bert/cls.py                          +89  -23
paddleslim/nas/darts/search_space/conv_bert/model/bert.py                   +4   -6
paddleslim/nas/darts/search_space/conv_bert/model/cls.py                    +4   -4
paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py    +40  -22
paddleslim/nas/darts/train_search.py                                        +47  -38
paddleslim/teachers/bert/model/transformer_encoder.py                       +6   -6

paddleslim/common/meter.py

@@ -16,8 +16,9 @@ __all__ = ['AvgrageMeter']
 class AvgrageMeter(object):
-    def __init__(self):
+    def __init__(self, format="{}"):
         self.reset()
+        self._format = format
 
     def reset(self):
         self.avg = 0
@@ -28,3 +29,6 @@ class AvgrageMeter(object):
         self.sum += val * n
         self.cnt += n
         self.avg = self.sum / self.cnt
+
+    def __repr__(self):
+        return self._format.format(self.avg)
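
The new format/__repr__ pair makes a meter print as a plain formatted number, which is what lets the reworked valid_one_epoch in train_search.py log a whole dict of meters in one line. A minimal sketch of the intended behaviour; the reset() and update() bodies beyond what the diff shows are assumed from the surrounding file:

# Sketch only: assumed full class for illustration; __init__/_format/__repr__ are
# the parts added by this commit, reset()/update() bodies are assumptions.
class AvgrageMeter(object):
    def __init__(self, format="{}"):
        self.reset()
        self._format = format

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt

    def __repr__(self):
        return self._format.format(self.avg)

meters = {"accuracy": AvgrageMeter("{:.4f}"), "FLOPs(M)": AvgrageMeter("{:.1f}")}
meters["accuracy"].update(0.91, 32)
meters["FLOPs(M)"].update(512.3, 1)
print(meters)  # prints {'accuracy': 0.9100, 'FLOPs(M)': 512.3} instead of object reprs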

paddleslim/nas/darts/architect.py

@@ -56,7 +56,10 @@ class Architect(object):
         else:
             loss = self._backward_step(valid_data)
         self.optimizer.minimize(loss)
+        # print("alphas gradient: {}".format(self.model.arch_parameters()[0].gradient()))
         self.optimizer.clear_gradients()
+        return self.model.arch_parameters()[0].gradient()
 
     def _backward_step(self, valid_data):
         loss = self.model.loss(valid_data)

paddleslim/nas/darts/search_space/conv_bert/cls.py

@@ -50,7 +50,9 @@ class AdaBERTClassifier(Layer):
                  beta=4,
                  conv_type="conv_bn",
                  search_layer=True,
-                 teacher_model=None):
+                 teacher_model=None,
+                 alphas=None,
+                 k=None):
         super(AdaBERTClassifier, self).__init__()
         self._n_layer = n_layer
         self._num_labels = num_labels
@@ -60,6 +62,8 @@ class AdaBERTClassifier(Layer):
         self._beta = beta
         self._conv_type = conv_type
         self._search_layer = search_layer
+        self._alphas = alphas
+        self._k = k
         print(
             "----------------------load teacher model and test----------------------------------------"
         )
@@ -86,20 +90,36 @@ class AdaBERTClassifier(Layer):
                 bias_attr=fluid.ParamAttr(
                     name="s_cls_out_%d_b" % i,
                     initializer=fluid.initializer.Constant(0.)))
-            fc = self.add_sublayer("cls_fc_%d" % i, fc)
+            fc = self.add_sublayer("s_cls_fc_%d" % i, fc)
             self.cls_fc.append(fc)
 
-    def forward(self, data_ids):
+    def forward(self, data_ids, alphas=None, k=None):
         src_ids = data_ids[0]
         position_ids = data_ids[1]
         sentence_ids = data_ids[2]
-        return self.student(src_ids, position_ids, sentence_ids)
+        return self.student(
+            src_ids,
+            position_ids,
+            sentence_ids,
+            alphas=self._alphas,
+            k=self._k)
 
     def arch_parameters(self):
         return self.student.arch_parameters()
 
+    def model_parameters(self):
+        model_parameters = [
+            p for p in self.student.parameters()
+            if p.name not in [a.name for a in self.arch_parameters()]
+        ]
+        return model_parameters
+
     def genotype(self):
-        return self.arch_parameters()
+        alphas = self.arch_parameters()[0].numpy()
+        alphas = [np.argmax(edge) for edge in alphas]
+        k = np.argmax(self.arch_parameters()[1].numpy())
+        return "layers: {}; edges: {} ".format(k, alphas)
 
     def new(self):
         model_new = AdaBERTClassifier(
@@ -108,8 +128,7 @@ class AdaBERTClassifier(Layer):
         )
         return model_new
 
-    def loss(self, data_ids):
-        T = 1.0
+    def valid(self, data_ids):
         src_ids = data_ids[0]
         position_ids = data_ids[1]
         sentence_ids = data_ids[2]
@@ -117,16 +136,57 @@ class AdaBERTClassifier(Layer):
         labels = data_ids[4]
         flops = []
         model_size = []
+        alphas = self.arch_parameters()[0].numpy(
+        ) if self._alphas is None else self._alphas
+        k = self.arch_parameters()[1].numpy() if self._k is None else self._k
+        print(alphas.shape)
+        print(k.shape)
         enc_outputs, next_sent_feats, k_i = self.student(
-            src_ids, position_ids, sentence_ids, flops=flops,
-            model_size=model_size)
+            src_ids,
+            position_ids,
+            sentence_ids,
+            flops=flops,
+            model_size=model_size,
+            alphas=alphas,
+            k=k)
+
+        logits = self.cls_fc[-1](next_sent_feats[-1])
+        probs = fluid.layers.softmax(logits)
+        accuracy = fluid.layers.accuracy(input=probs, label=labels)
+        model_size = np.sum(model_size)
+        flops = np.sum(flops)
+        ret = {
+            "accuracy": accuracy.numpy(),
+            "model_size(MB)": model_size / 1e6,
+            "FLOPs(M)": flops / 1e6
+        }
+        return ret
+
+    def loss(self, data_ids):
+        T = 1.0
+        src_ids = data_ids[0]
+        position_ids = data_ids[1]
+        sentence_ids = data_ids[2]
+        input_mask = data_ids[3]
+        labels = data_ids[4]
+        flops = []
+        model_size = []
 
         self.teacher.eval()
         total_loss, t_logits, t_losses, accuracys, num_seqs = self.teacher(
             data_ids)
         self.teacher.train()
+        enc_outputs, next_sent_feats, k_i = self.student(
+            src_ids,
+            position_ids,
+            sentence_ids,
+            flops=flops,
+            model_size=model_size,
+            alphas=self._alphas,
+            k=self._k)
 
         # define kd loss
         kd_losses = []
@@ -140,21 +200,16 @@ class AdaBERTClassifier(Layer):
         kd_weights = np.exp(kd_weights - np.max(kd_weights))
         kd_weights = kd_weights / kd_weights.sum(axis=0)
 
+        s_probs = None
         for i in range(len(next_sent_feats)):
            j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
            t_logit = t_logits[j]
            s_sent_feat = next_sent_feats[i]
            fc = self.cls_fc[i]
            s_sent_feat = fluid.layers.dropout(
                x=s_sent_feat,
                dropout_prob=0.1,
                dropout_implementation="upscale_in_train")
            s_logits = fc(s_sent_feat)
-           t_logit.stop_gradient = True
            t_probs = fluid.layers.softmax(t_logit)
            s_probs = fluid.layers.softmax(s_logits)
+           t_probs.stop_gradient = True
            kd_loss = t_probs * fluid.layers.log(s_probs / T)
            kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
            kd_loss = kd_loss * kd_weights[i]
@@ -167,17 +222,28 @@ class AdaBERTClassifier(Layer):
         ce_loss = fluid.layers.cross_entropy(s_probs, labels)
         ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i
 
+        len_model_size = len(model_size)
         # define e loss
-        model_size = fluid.layers.sum(model_size)
-        # print("model_size: {}".format(model_size.numpy()/1e6))
+        if self._alphas is not None:
+            flops = np.sum(flops)
+            model_size = np.sum(model_size)
+        else:
+            flops = fluid.layers.sum(flops)
+            model_size = fluid.layers.sum(model_size)
         model_size = model_size / self.student.max_model_size()
-        flops = fluid.layers.sum(flops) / self.student.max_flops()
-        e_loss = (len(next_sent_feats) * k_i / self._n_layer) * (
-            flops + model_size)
+        flops = flops / self.student.max_flops()
+        e_loss = (flops + model_size) * (
+            len(next_sent_feats) * k_i / self._n_layer)
+        print("len(next_sent_feats): {}; k_i: {}; flops: {}; model_size: {}; len: {}".
+              format(
+                  len(next_sent_feats), k_i,
+                  flops.numpy(), model_size.numpy(), len_model_size))
         # define total loss
         loss = (1 - self._gamma
                 ) * ce_loss - self._gamma * kd_loss + self._beta * e_loss
         # print("ce_loss: {}; kd_loss: {}; e_loss: {}".format((
         # 1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta *
         # e_loss.numpy()))
         return loss, ce_loss, kd_loss, e_loss
+        # loss = ce_loss + self._beta * e_loss
+        # return loss, ce_loss, ce_loss, e_loss
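
Two of the additions above are easiest to see with plain numbers: genotype() now decodes the architecture by taking an argmax per edge over the op logits and an argmax over the layer-count logits, and loss() folds in an efficiency term built from normalized FLOPs and model size. A numpy-only sketch of that arithmetic; the shapes and the max_* constants below are illustrative stand-ins, not the real search-space dimensions:

import numpy as np

rng = np.random.default_rng(0)
edge_alphas = rng.normal(size=(14, 10))   # hypothetical: one row of op logits per edge
layer_k = rng.normal(size=(8,))           # hypothetical: logits over how many cells to keep

# genotype(): argmax per edge, argmax over k
ops = [int(np.argmax(edge)) for edge in edge_alphas]
k = int(np.argmax(layer_k))
print("layers: {}; edges: {}".format(k, ops))

# e_loss: normalized FLOPs + model size, scaled by the fraction of layers actually used
flops, model_size = 3.2e9, 45e6           # pretend sums collected during the student forward
max_flops, max_model_size = 6.0e9, 60e6   # pretend super-net maxima
n_layer, n_used, k_i = 8, 6, 1.0
e_loss = (flops / max_flops + model_size / max_model_size) * (n_used * k_i / n_layer)

# total objective as in loss(): (1 - gamma) * ce - gamma * kd + beta * e
gamma, beta, ce_loss, kd_loss = 0.8, 4.0, 0.7, -2.1
loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
print(e_loss, loss)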

paddleslim/nas/darts/search_space/conv_bert/model/bert.py

@@ -110,10 +110,9 @@ class BertModelLayer(Layer):
                 position_ids,
                 sentence_ids,
                 flops=[],
-                model_size=[]):
-        """
-        forward
-        """
+                model_size=[],
+                alphas=None,
+                k=None):
         src_emb = self._src_emb(src_ids)
         pos_emb = self._pos_emb(position_ids)
         sent_emb = self._sent_emb(sentence_ids)
@@ -122,9 +121,8 @@ class BertModelLayer(Layer):
         emb_out = emb_out + sent_emb
 
         emb_out = self._emb_fac(emb_out)
         enc_outputs, k_i = self._encoder(
-            emb_out, flops=flops, model_size=model_size)
+            emb_out, flops=flops, model_size=model_size, alphas=alphas, k=k)
 
         if not self.return_pooled_out:
             return enc_outputs

paddleslim/nas/darts/search_space/conv_bert/model/cls.py

@@ -57,12 +57,12 @@ class ClsModelLayer(Layer):
             fc = Linear(
                 input_dim=self.config["hidden_size"],
                 output_dim=num_labels,
-                param_attr=fluid.ParamAttr(
+                param_attr=fluid.paramattr(
                     name="cls_out_%d_w" % i,
-                    initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.truncatednormal(scale=0.02)),
+                bias_attr=fluid.paramattr(
                     name="cls_out_%d_b" % i,
-                    initializer=fluid.initializer.Constant(0.)))
+                    initializer=fluid.initializer.constant(0.)))
             fc = self.add_sublayer("cls_fc_%d" % i, fc)
             self.cls_fc.append(fc)

paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py

@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from collections import Iterable
 import paddle
 import paddle.fluid as fluid
@@ -25,13 +26,13 @@ from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, Ba
 from paddle.fluid.initializer import NormalInitializer
 
 GConv_PRIMITIVES = [
-    'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3', 'dil_gconv_5',
-    'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'none', 'skip_connect'
+    'none', 'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3',
+    'dil_gconv_5', 'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'skip_connect'
 ]
 
 ConvBN_PRIMITIVES = [
-    'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
-    'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none',
-    'skip_connect'
+    'none', 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
+    'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3',
+    'skip_connect'
 ]
@@ -117,7 +118,11 @@ class MixedOp(fluid.dygraph.Layer):
     def forward(self, x, weights, flops=[], model_size=[]):
         for i in range(len(self._ops)):
-            if weights[i].numpy() != 0:
+            if isinstance(weights, Iterable):
+                weights_i = weights[i]
+            else:
+                weights_i = weights[i].numpy()
+            if weights_i != 0:
                 flops.append(FLOPs.values()[i] * weights[i])
                 model_size.append(ModelSize.values()[i] * weights[i])
                 return self._ops[i](x) * weights[i]
@@ -166,6 +171,7 @@ class ConvBNRelu(fluid.dygraph.Layer):
                  use_cudnn=True,
                  name=None):
         super(ConvBNRelu, self).__init__()
+        self._name = name
         conv_std = (2.0 / (filter_size[0] * filter_size[1] * out_c * in_c))**0.5
         conv_param = fluid.ParamAttr(
@@ -187,7 +193,7 @@ class ConvBNRelu(fluid.dygraph.Layer):
     def forward(self, inputs):
         conv = self.conv(inputs)
         bn = self.bn(conv)
-        return bn
+        return conv
 
 
 class GateConv(fluid.dygraph.Layer):
@@ -261,24 +267,30 @@ class Cell(fluid.dygraph.Layer):
         states = [s0, s1]
         offset = 0
         for i in range(self._steps):
-            s = fluid.layers.sums([
-                self._ops[offset + j](
-                    h, weights[offset + j], flops=flops, model_size=model_size)
-                for j, h in enumerate(states)
-            ])
+            edges = []
+            for j, h in enumerate(states):
+                edge = self._ops[offset + j](
+                    h, weights[offset + j], flops=flops, model_size=model_size)
+                edges.append(edge)
+            s = edges[0]
+            for n in range(1, len(edges)):
+                s = s + edges[n]
+            # s = fluid.layers.sums(edges)
             offset += len(states)
             states.append(s)
-        out = fluid.layers.sum(states[-self._steps:])
+        states = states[-self._steps:]
+        out = states[0]
+        for n in range(1, len(states)):
+            out = out + states[n]
+        # out = fluid.layers.sums(states[-self._steps:])
         return out
 
 
 class EncoderLayer(Layer):
     """
     encoder
     """
     def __init__(self,
                  n_layer,
                  hidden_size=768,
@@ -342,15 +354,17 @@ class EncoderLayer(Layer):
             default_initializer=NormalInitializer(
                 loc=0.0, scale=1e-3))
 
-    def forward(self, enc_input, flops=[], model_size=[]):
+    def forward(self, enc_input, flops=[], model_size=[], alphas=None, k=None):
         tmp = fluid.layers.reshape(
             enc_input, [-1, 1, enc_input.shape[1],
                         self._hidden_size])  #(bs, 1, seq_len, hidden_size)
 
         tmp = self.conv0(tmp)  # (bs, hidden_size, seq_len, 1)
 
-        alphas = gumbel_softmax(self.alphas)
-        k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
+        if alphas is None:
+            alphas = gumbel_softmax(self.alphas)
+        if k is None:
+            k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
 
         outputs = []
         s0 = s1 = tmp
@@ -364,7 +378,11 @@ class EncoderLayer(Layer):
                     enc_output, [-1, enc_output.shape[1],
                                  self._hidden_size])  # (bs, seq_len, hidden_size)
                 outputs.append(enc_output)
-                if self._search_layer and k[i].numpy() != 0:
+                if isinstance(k, Iterable):
+                    k_i = k[i]
+                else:
+                    k_i = k[i].numpy()
+                if k_i != 0:
                     outputs[-1] = outputs[-1] * k[i]
                     return outputs, k[i]
         return outputs, 1.0
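
The MixedOp change above is the core of the new fixed-architecture path: when weights is already a plain iterable (e.g. a numpy vector) instead of a dygraph Variable, it is indexed directly, a zero entry skips the op, and a non-zero entry contributes weighted FLOPs/model size plus the op output. A small numpy sketch of that control flow; the primitive table and per-op costs here are hypothetical stand-ins for the real FLOPs/ModelSize tables:

import numpy as np

PRIMITIVES = ['none', 'std_conv_bn_3', 'skip_connect']            # stand-in op names
FLOPS = {'none': 0.0, 'std_conv_bn_3': 2.0e6, 'skip_connect': 0.0}  # stand-in costs
SIZES = {'none': 0.0, 'std_conv_bn_3': 1.2e5, 'skip_connect': 0.0}
OPS = {'none': lambda x: x * 0, 'std_conv_bn_3': lambda x: x * 2, 'skip_connect': lambda x: x}

def mixed_op(x, weights, flops, model_size):
    for i, name in enumerate(PRIMITIVES):
        # numpy vector -> index directly; otherwise pull a scalar out of the tensor
        w = weights[i] if isinstance(weights, np.ndarray) else weights[i].numpy()
        if w != 0:
            flops.append(FLOPS[name] * w)        # accumulate weighted cost
            model_size.append(SIZES[name] * w)
            return OPS[name](x) * w              # first non-zero op wins (one-hot weights)

flops, model_size = [], []
one_hot = np.array([0.0, 1.0, 0.0])              # e.g. a hard gumbel-softmax sample
print(mixed_op(np.ones(3), one_hot, flops, model_size), flops, model_size)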

paddleslim/nas/darts/train_search.py

@@ -18,10 +18,12 @@ from __future__ import print_function
 __all__ = ['DARTSearch']
 
+import math
 import logging
 from itertools import izip
 import numpy as np
 import paddle.fluid as fluid
+from paddle.fluid.framework import Variable
 from paddle.fluid.dygraph.base import to_variable
 from ...common import AvgrageMeter, get_logger
 from .architect import Architect
@@ -41,6 +43,7 @@ class DARTSearch(object):
                  model,
                  train_reader,
                  valid_reader,
+                 test_reader=None,
                  learning_rate=0.025,
                  batchsize=64,
                  num_imgs=50000,
@@ -54,6 +57,7 @@ class DARTSearch(object):
         self.model = model
         self.train_reader = train_reader
         self.valid_reader = valid_reader
+        self.test_reader = test_reader
         self.learning_rate = learning_rate
         self.batchsize = batchsize
         self.num_imgs = num_imgs
@@ -82,10 +86,13 @@ class DARTSearch(object):
         step_id = 0
         for train_data, valid_data in izip(train_loader(), valid_loader()):
             if epoch >= self.epochs_no_archopt:
-                architect.step(train_data, valid_data)
+                alphas_grad = architect.step(train_data, valid_data)
 
             loss, ce_loss, kd_loss, e_loss = self.model.loss(train_data)
+            if math.isnan(e_loss.numpy()):
+                print("alphas_grad: {}".format(alphas_grad))
+                print("alphas: {}".format(self.model.arch_parameters()[0]
+                                          .numpy()))
 
             if self.use_data_parallel:
                 loss = self.model.scale_loss(loss)
                 loss.backward()
@@ -93,67 +100,55 @@ class DARTSearch(object):
             else:
                 loss.backward()
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
-            optimizer.minimize(loss, grad_clip)
+            # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
+            # optimizer.minimize(loss, grad_clip)
+            optimizer.minimize(loss)
             self.model.clear_gradients()
 
             batch_size = train_data[0].shape[0]
             objs.update(loss.numpy(), batch_size)
+            e_loss = e_loss.numpy() if isinstance(e_loss, Variable) else e_loss
             ce_losses.update(ce_loss.numpy(), batch_size)
             kd_losses.update(kd_loss.numpy(), batch_size)
-            e_losses.update(e_loss.numpy(), batch_size)
+            e_losses.update(e_loss, batch_size)
 
             if step_id % self.log_freq == 0:
+                #logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format(
+                #    epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0]))
                 logger.info(
                     "Train Epoch {}, Step {}, loss {}; ce: {}; kd: {}; e: {}".
                     format(epoch, step_id,
                            loss.numpy(),
-                           ce_loss.numpy(), kd_loss.numpy(), e_loss.numpy()))
+                           ce_loss.numpy(), kd_loss.numpy(), e_loss))
             step_id += 1
         return objs.avg[0]
 
     def valid_one_epoch(self, valid_loader, epoch):
-        objs = AvgrageMeter()
-        top1 = AvgrageMeter()
-        top5 = AvgrageMeter()
         self.model.eval()
+        meters = {}
         for step_id, valid_data in enumerate(valid_loader):
-            image = to_variable(image)
-            label = to_variable(label)
-            n = image.shape[0]
-            logits = self.model(image)
-            prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
-            prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
-            loss = fluid.layers.reduce_mean(
-                fluid.layers.softmax_with_cross_entropy(logits, label))
-            objs.update(loss.numpy(), n)
-            top1.update(prec1.numpy(), n)
-            top5.update(prec5.numpy(), n)
+            ret = self.model.valid(valid_data)
+            for key, value in ret.items():
+                if key not in meters:
+                    meters[key] = AvgrageMeter()
+                meters[key].update(value, 1)
             if step_id % self.log_freq == 0:
-                logger.info(
-                    "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
-                    format(epoch, step_id, objs.avg[0], top1.avg[0],
-                           top5.avg[0]))
-        return top1.avg[0]
+                logger.info("Valid Epoch {}, Step {}, {}".format(
+                    epoch, step_id, meters))
 
     def train(self):
         if self.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
 
-        model_parameters = [
-            p for p in self.model.parameters()
-            if p.name not in [a.name for a in self.model.arch_parameters()]
-        ]
-        logger.info("param size = {:.6f}MB".format(
+        model_parameters = self.model.model_parameters()
+        logger.info("parameter size in super net: {:.6f}M".format(
            count_parameters_in_MB(model_parameters)))
         step_per_epoch = int(self.num_imgs * 0.5 / self.batchsize)
         if self.unrolled:
             step_per_epoch *= 2
         learning_rate = fluid.dygraph.CosineDecay(self.learning_rate,
                                                   step_per_epoch,
                                                   self.num_epochs)
         optimizer = fluid.optimizer.MomentumOptimizer(
             learning_rate,
             0.9,
@@ -167,6 +162,9 @@ class DARTSearch(object):
                 self.train_reader)
             self.valid_reader = fluid.contrib.reader.distributed_batch_reader(
                 self.valid_reader)
+            if self.test_reader is not None:
+                self.test_reader = fluid.contrib.reader.distributed_batch_reader(
+                    self.test_reader)
 
         train_loader = fluid.io.DataLoader.from_generator(
             capacity=64,
@@ -182,6 +180,17 @@ class DARTSearch(object):
         train_loader.set_batch_generator(self.train_reader, places=self.place)
         valid_loader.set_batch_generator(self.valid_reader, places=self.place)
 
+        if self.test_reader is not None:
+            test_loader = fluid.io.DataLoader.from_generator(
+                capacity=64,
+                use_double_buffer=True,
+                iterable=True,
+                return_list=True)
+            test_loader.set_batch_generator(self.test_reader, places=self.place)
+        else:
+            test_loader = valid_loader
+
         architect = Architect(self.model, learning_rate,
                               self.arch_learning_rate, self.place,
                               self.unrolled)
@@ -199,8 +208,8 @@ class DARTSearch(object):
             self.train_one_epoch(train_loader, valid_loader, architect,
                                  optimizer, epoch)
             if epoch == self.num_epochs - 1:
+                # valid_top1 = self.valid_one_epoch(valid_loader, epoch)
+                logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, 1))
             if save_parameters:
                 fluid.save_dygraph(self.model.state_dict(), "./weights")
+            # if epoch == self.num_epochs - 1:
+            #     self.valid_one_epoch(test_loader, epoch)
+            # if save_parameters:
+            #     fluid.save_dygraph(self.model.state_dict(), "./weights")
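
One portability note, not part of this commit: train_one_epoch pairs the two loaders with itertools.izip, which only exists on Python 2. If the demo ever needs to run under Python 3, a guarded import keeps the same lazy pairing:

# Sketch (assumption, not in the commit): fall back to built-in zip on Python 3,
# where zip is already lazy and izip does not exist.
try:
    from itertools import izip  # Python 2
except ImportError:
    izip = zip                  # Python 3

# usage mirroring train_one_epoch: walk two iterables in lock step
for train_data, valid_data in izip(iter([1, 2, 3]), iter("abc")):
    print(train_data, valid_data)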

paddleslim/teachers/bert/model/transformer_encoder.py

@@ -57,7 +57,7 @@ class PrePostProcessLayer(Layer):
             elif cmd == "d":  # add dropout
                 if dropout_rate:
                     self.functors.append(lambda x: fluid.layers.dropout(
-                        x, dropout_prob=dropout_rate, is_test=False))
+                        x, dropout_prob=dropout_rate, is_test=True))
                     self.exec_order += "d"
 
     def forward(self, x, residual=None):
@@ -111,8 +111,8 @@ class PositionwiseFeedForwardLayer(Layer):
             hidden = fluid.layers.dropout(
                 hidden,
                 dropout_prob=self._dropout_rate,
-                upscale_in_train="upscale_in_train",
-                is_test=False)
+                # upscale_in_train="upscale_in_train",
+                is_test=True)
         out = self._h2o(hidden)
         return out
@@ -218,13 +218,13 @@ class MultiHeadAttentionLayer(Layer):
             #alpha=self._d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
-        weights = fluid.layers.softmax(product)
+        weights = fluid.layers.softmax(product)  # 48
         if self._dropout_rate:
             weights_droped = fluid.layers.dropout(
                 weights,
                 dropout_prob=self._dropout_rate,
-                dropout_implementation="upscale_in_train",
-                is_test=False)
+                # dropout_implementation="upscale_in_train",
+                is_test=True)
             out = fluid.layers.matmul(weights_droped, transpose_v)
         else:
             out = fluid.layers.matmul(weights, transpose_v)
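
The recurring edit in this file is flipping is_test from False to True in fluid.layers.dropout, which turns the teacher's dropout into a deterministic pass-through (scaled by 1 - p under the default "downgrade_in_infer" implementation). A numpy emulation of that difference, a sketch rather than the Paddle implementation itself:

import numpy as np

def dropout(x, p, is_test, rng=np.random.default_rng(0)):
    # emulates the default "downgrade_in_infer" behaviour
    if is_test:
        return x * (1.0 - p)           # inference path: deterministic scaling
    mask = rng.random(x.shape) >= p    # training path: keep ~ (1 - p) of the units
    return x * mask

x = np.ones((2, 4))
print(dropout(x, 0.1, is_test=False))  # random zeros, run-to-run noise
print(dropout(x, 0.1, is_test=True))   # deterministic 0.9 * x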