Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleSlim
提交
dc44c944
P
PaddleSlim
项目概览
PaddlePaddle
/
PaddleSlim
1 年多 前同步成功
通知
51
Star
1434
Fork
344
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
53
列表
看板
标记
里程碑
合并请求
16
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleSlim
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
53
Issue
53
列表
看板
标记
里程碑
合并请求
16
合并请求
16
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
dc44c944
编写于
4月 01, 2022
作者:
I
iamWHTWD
提交者:
GitHub
4月 01, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Update sa_nas_mobilenetv2.py
上级
76356ff6
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
315 addition
and
1028 deletion
+315
-1028
demo/nas/sa_nas_mobilenetv2.py
demo/nas/sa_nas_mobilenetv2.py
+315
-1028
未找到文件。
demo/nas/sa_nas_mobilenetv2.py
浏览文件 @
dc44c944
#!/usr/bin/env bash
import
sys
##################
sys
.
path
.
append
(
'..'
)
#bash slim_ci_demo_all_case.sh $5 $6;
import
numpy
as
np
import
argparse
print_info
(){
import
ast
if
[
$
1
-
ne
0
];
then
import
time
mv
$
{
log_path
}
/
$
2
$
{
log_path
}
/
FAIL_
$
2.
log
import
argparse
echo
-
e
"
\033
[31m ${log_path}/FAIL_$2
\033
[0m"
import
ast
echo
"fail log as follow"
import
logging
cat
$
{
log_path
}
/
FAIL_
$
2.
log
import
paddle
else
import
paddle.nn
as
nn
mv
$
{
log_path
}
/
$
2
$
{
log_path
}
/
SUCCESS_
$
2.
log
import
paddle.static
as
static
echo
-
e
"
\033
[32m ${log_path}/SUCCESS_$2
\033
[0m"
import
paddle.nn.functional
as
F
cat
$
{
log_path
}
/
SUCCESS_
$
2.
log
import
paddle.vision.transforms
as
T
fi
from
paddle
import
ParamAttr
}
from
paddleslim.analysis
import
flops
from
paddleslim.nas
import
SANAS
catchException
()
{
from
paddleslim.common
import
get_logger
echo
$
1
failed
due
to
exception
>>
FAIL_Exception
.
log
from
optimizer
import
create_optimizer
}
import
imagenet_reader
cudaid1
=
$
1
;
_logger
=
get_logger
(
__name__
,
level
=
logging
.
INFO
)
cudaid2
=
$
2
;
echo
"cudaid1,cudaid2"
,
$
{
cudaid1
},
$
{
cudaid2
}
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
def
build_program
(
main_program
,
#分布式log输出方式
startup_program
,
export
PADDLE_LOG_LEVEL
=
debug
image_shape
,
dataset
,
export
FLAGS_fraction_of_gpu_memory_to_use
=
0.98
archs
,
# data PaddleSlim/demo/data/ILSVRC2012
args
,
cd
$
{
slim_dir
}
/
demo
places
,
if
[
-
d
"data"
];
then
is_test
=
False
):
rm
-
rf
data
with
static
.
program_guard
(
main_program
,
startup_program
):
fi
with
paddle
.
utils
.
unique_name
.
guard
():
wget
-
q
https
:
//
sys
-
p0
.
bj
.
bcebos
.
com
/
slim_ci
/
ILSVRC2012_data_demo
.
tar
.
gz
--
no
-
check
-
certificate
data_shape
=
[
None
]
+
image_shape
tar
xf
ILSVRC2012_data_demo
.
tar
.
gz
data
=
static
.
data
(
name
=
'data'
,
shape
=
data_shape
,
dtype
=
'float32'
)
mv
ILSVRC2012_data_demo
data
label
=
static
.
data
(
name
=
'label'
,
shape
=
[
None
,
1
],
dtype
=
'int64'
)
# download pretrain model
if
args
.
data
==
'cifar10'
:
root_url
=
"http://paddle-imagenet-models-name.bj.bcebos.com"
paddle
.
assign
(
paddle
.
reshape
(
label
,
[
-
1
,
1
]),
label
)
pre_models
=
"MobileNetV1 MobileNetV2 MobileNetV3_large_x1_0_ssld ResNet101_vd MobileNetV2 ResNet34 ResNet50 ResNet50_vd"
if
is_test
:
if
[
-
d
"pretrain"
];
then
data_loader
=
paddle
.
io
.
DataLoader
(
rm
-
rf
pretrain
dataset
,
fi
places
=
places
,
mkdir
pretrain
&&
cd
pretrain
feed_list
=
[
data
,
label
],
for
model
in
$
{
pre_models
}
drop_last
=
False
,
do
batch_size
=
args
.
batch_size
,
if
[
!
-
f
$
{
model
}
];
then
return_list
=
False
,
wget
-
q
$
{
root_url
}
/
$
{
model
}
_pretrained
.
tar
shuffle
=
False
)
tar
xf
$
{
model
}
_pretrained
.
tar
else
:
fi
data_loader
=
paddle
.
io
.
DataLoader
(
done
dataset
,
places
=
places
,
# 1 dist
feed_list
=
[
data
,
label
],
demo_distillation_01
(){
drop_last
=
True
,
cd
$
{
slim_dir
}
/
demo
/
distillation
||
catchException
demo_distillation
batch_size
=
args
.
batch_size
,
if
[
-
d
"output"
];
then
return_list
=
False
,
rm
-
rf
output
shuffle
=
True
,
fi
use_shared_memory
=
True
,
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
num_workers
=
4
)
python
distill
.
py
--
num_epochs
1
--
save_inference
True
>
$
{
log_path
}
/
demo_distillation_ResNet50_vd_T
2
>&
1
output
=
archs
(
data
)
print_info
$?
demo_distillation_ResNet50_vd_T
output
=
static
.
nn
.
fc
(
x
=
output
,
size
=
args
.
class_dim
)
}
softmax_out
=
F
.
softmax
(
output
)
cost
=
F
.
cross_entropy
(
softmax_out
,
label
=
label
)
demo_distillation_02
(){
avg_cost
=
paddle
.
mean
(
cost
)
cd
$
{
slim_dir
}
/
demo
/
distillation
||
catchException
demo_distillation
acc_top1
=
paddle
.
metric
.
accuracy
(
if
[
-
d
"output"
];
then
input
=
softmax_out
,
label
=
label
,
k
=
1
)
rm
-
rf
output
acc_top5
=
paddle
.
metric
.
accuracy
(
fi
input
=
softmax_out
,
label
=
label
,
k
=
5
)
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
if
is_test
==
False
:
python
distill
.
py
--
num_epochs
1
--
batch_size
64
--
save_inference
True
\
optimizer
=
create_optimizer
(
args
)
--
model
ResNet50
--
teacher_model
ResNet101_vd
\
optimizer
.
minimize
(
avg_cost
)
--
teacher_pretrained_model
..
/
pretrain
/
ResNet101_vd_pretrained
>
$
{
log_path
}
/
demo_distillation_ResNet101_vd_ResNet50_T
2
>&
1
return
data_loader
,
avg_cost
,
acc_top1
,
acc_top5
print_info
$?
demo_distillation_ResNet101_vd_ResNet50_T
python
distill
.
py
--
num_epochs
1
--
batch_size
64
--
save_inference
True
\
def
search_mobilenetv2
(
config
,
args
,
image_size
,
is_server
=
True
):
--
model
MobileNetV2_x0_25
--
teacher_model
MobileNetV2
\
image_shape
=
[
3
,
image_size
,
image_size
]
--
teacher_pretrained_model
..
/
pretrain
/
MobileNetV2_pretrained
>
$
{
log_path
}
/
demo_distillation_MobileNetV2_MobileNetV2_x0_25_T
2
>&
1
if
args
.
data
==
'cifar10'
:
print_info
$?
demo_distillation_MobileNetV2_MobileNetV2_x0_25_T
transform
=
T
.
Compose
([
T
.
Transpose
(),
T
.
Normalize
([
127.5
],
[
127.5
])])
}
train_dataset
=
paddle
.
vision
.
datasets
.
Cifar10
(
mode
=
'train'
,
transform
=
transform
,
backend
=
'cv2'
)
demo_deep_mutual_learning
(){
val_dataset
=
paddle
.
vision
.
datasets
.
Cifar10
(
cd
$
{
slim_dir
}
/
demo
/
deep_mutual_learning
||
catchException
demo_deep_mutual_learning
mode
=
'test'
,
transform
=
transform
,
backend
=
'cv2'
)
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
model
=
dml_mv1_mv1_gpu1
elif
args
.
data
==
'imagenet'
:
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
train_dataset
=
imagenet_reader
.
ImageNetDataset
(
mode
=
'train'
)
python
dml_train
.
py
--
epochs
1
>
$
{
log_path
}
/
$
{
model
}
2
>&
1
val_dataset
=
imagenet_reader
.
ImageNetDataset
(
mode
=
'val'
)
print_info
$?
$
{
model
}
model
=
dml_mv1_res50_gpu1
places
=
static
.
cuda_places
()
if
args
.
use_gpu
else
static
.
cpu_places
()
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
place
=
places
[
0
]
python
dml_train
.
py
--
models
=
'mobilenet-resnet50'
--
batch_size
128
--
epochs
1
>
$
{
log_path
}
/
$
{
model
}
2
>&
1
if
is_server
:
print_info
$?
$
{
model
}
### start a server and a client
}
sa_nas
=
SANAS
(
config
,
all_distillation
(){
# 大数据 5个模型
server_addr
=
(
args
.
server_address
,
args
.
port
),
demo_distillation_01
# 3
search_steps
=
args
.
search_steps
,
#demo_distillation_02
is_server
=
True
)
#demo_deep_mutual_learning # 2
else
:
}
### start a client
# 2.1 quant/quant_aware 使用小数据集即可
sa_nas
=
SANAS
(
demo_quant_quant_aware
(){
config
,
cd
$
{
slim_dir
}
/
demo
/
quant
/
quant_aware
||
catchException
demo_quant_quant_aware
server_addr
=
(
args
.
server_address
,
args
.
port
),
if
[
-
d
"output"
];
then
search_steps
=
args
.
search_steps
,
rm
-
rf
output
is_server
=
False
)
fi
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
for
step
in
range
(
args
.
search_steps
):
# 2.1版本时默认BS=256会报显存不足,故暂时修改成128
archs
=
sa_nas
.
next_archs
()[
0
]
python
train
.
py
--
model
MobileNet
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV1_pretrained
\
--
checkpoint_dir
.
/
output
/
mobilenetv1
--
num_epochs
1
--
batch_size
128
>
$
{
log_path
}
/
demo_quant_quant_aware_v1
2
>&
1
train_program
=
static
.
Program
()
print_info
$?
demo_quant_quant_aware_v1
test_program
=
static
.
Program
()
startup_program
=
static
.
Program
()
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
train_loader
,
avg_cost
,
acc_top1
,
acc_top5
=
build_program
(
python
train
.
py
--
model
ResNet34
\
train_program
,
startup_program
,
image_shape
,
train_dataset
,
archs
,
--
pretrained_model
..
/
..
/
pretrain
/
ResNet34_pretrained
\
args
,
places
)
--
checkpoint_dir
.
/
output
/
ResNet34
--
num_epochs
1
>
$
{
log_path
}
/
demo_quant_quant_aware_ResNet34_T
2
>&
1
print_info
$?
demo_quant_quant_aware_ResNet34_T
current_flops
=
flops
(
train_program
)
}
print
(
'step: {}, current_flops: {}'
.
format
(
step
,
current_flops
))
# 2.2 quant/quant_embedding
if
current_flops
>
int
(
321208544
):
demo_quant_quant_embedding
(){
continue
cd
$
{
slim_dir
}
/
demo
/
quant
/
quant_embedding
||
catchException
demo_quant_quant_embedding
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
test_loader
,
test_avg_cost
,
test_acc_top1
,
test_acc_top5
=
build_program
(
# 先使用word2vec的demo数据进行一轮训练,比较量化前infer结果同量化后infer结果different
test_program
,
if
[
-
d
"data"
];
then
startup_program
,
rm
-
rf
data
image_shape
,
fi
val_dataset
,
wget
-
q
https
:
//
sys
-
p0
.
bj
.
bcebos
.
com
/
slim_ci
/
word_2evc_demo_data
.
tar
.
gz
--
no
-
check
-
certificate
archs
,
tar
xf
word_2evc_demo_data
.
tar
.
gz
args
,
mv
word_2evc_demo_data
data
place
,
if
[
-
d
"v1_cpu5_b100_lr1dir"
];
then
is_test
=
True
)
rm
-
rf
v1_cpu5_b100_lr1dir
test_program
=
test_program
.
clone
(
for_test
=
True
)
fi
OPENBLAS_NUM_THREADS
=
1
CPU_NUM
=
5
python
train
.
py
--
train_data_dir
data
/
convert_text8
\
exe
=
static
.
Executor
(
place
)
--
dict_path
data
/
test_build_dict
--
num_passes
1
--
batch_size
100
--
model_output_dir
v1_cpu5_b100_lr1dir
\
exe
.
run
(
startup_program
)
--
base_lr
1.0
--
print_batch
1000
--
with_speed
--
is_sparse
>
$
{
log_path
}
/
quant_em_word2vec_T
2
>&
1
print_info
$?
quant_em_word2vec_T
build_strategy
=
static
.
BuildStrategy
()
# 量化前infer
train_compiled_program
=
static
.
CompiledProgram
(
python
infer
.
py
--
infer_epoch
--
test_dir
data
/
test_mid_dir
\
train_program
).
with_data_parallel
(
--
dict_path
data
/
test_build_dict_word_to_id_
\
loss_name
=
avg_cost
.
name
,
build_strategy
=
build_strategy
)
--
batch_size
20000
--
model_dir
v1_cpu5_b100_lr1dir
/
\
for
epoch_id
in
range
(
args
.
retain_epoch
):
--
start_index
0
--
last_index
0
>
$
{
log_path
}
/
quant_em_infer1
2
>&
1
for
batch_id
,
data
in
enumerate
(
train_loader
()):
print_info
$?
quant_em_infer1
fetches
=
[
avg_cost
.
name
]
# 量化后infer
s_time
=
time
.
time
()
python
infer
.
py
--
infer_epoch
--
test_dir
data
/
test_mid_dir
\
outs
=
exe
.
run
(
train_compiled_program
,
--
dict_path
data
/
test_build_dict_word_to_id_
\
feed
=
data
,
--
batch_size
20000
--
model_dir
v1_cpu5_b100_lr1dir
/
--
start_index
0
\
fetch_list
=
fetches
)[
0
]
--
last_index
0
--
emb_quant
True
>
$
{
log_path
}
/
quant_em_infer2
2
>&
1
batch_time
=
time
.
time
()
-
s_time
print_info
$?
quant_em_infer2
if
batch_id
%
10
==
0
:
}
_logger
.
info
(
# 2.3 quan_post # 小数据集
'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
.
demo_quant_quant_post
(){
format
(
step
,
epoch_id
,
batch_id
,
outs
[
0
],
batch_time
))
# 20210425 新增4种离线量化方法
cd
$
{
slim_dir
}
/
demo
/
quant
/
quant_post
||
catchException
demo_quant_quant_post
reward
=
[]
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
for
batch_id
,
data
in
enumerate
(
test_loader
()):
# 1 导出模型
test_fetches
=
[
python
export_model
.
py
--
model
"MobileNet"
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV1_pretrained
\
test_avg_cost
.
name
,
test_acc_top1
.
name
,
test_acc_top5
.
name
--
data
imagenet
>
$
{
log_path
}
/
st_quant_post_v1_export
2
>&
1
]
print_info
$?
st_quant_post_v1_export
batch_reward
=
exe
.
run
(
test_program
,
# 量化前eval
feed
=
data
,
python
eval
.
py
--
model_path
.
/
inference_model
/
MobileNet
--
model_name
model
\
fetch_list
=
test_fetches
)
--
params_name
weights
>
$
{
log_path
}
/
st_quant_post_v1_eval1
2
>&
1
reward_avg
=
np
.
mean
(
np
.
array
(
batch_reward
),
axis
=
1
)
print_info
$?
st_quant_post_v1_eval1
reward
.
append
(
reward_avg
)
# 3 离线量化
_logger
.
info
(
# 4 量化后eval
'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
.
for
algo
in
hist
avg
mse
format
(
step
,
batch_id
,
batch_reward
[
0
],
batch_reward
[
1
],
do
batch_reward
[
2
]))
## 不带bc 离线量化
echo
"quant_post train no bc "
$
{
algo
}
finally_reward
=
np
.
mean
(
np
.
array
(
reward
),
axis
=
0
)
python
quant_post
.
py
--
model_path
.
/
inference_model
/
MobileNet
\
_logger
.
info
(
--
save_path
.
/
quant_model
/
$
{
algo
}
/
MobileNet
\
'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'
.
format
(
--
model_filename
model
--
params_filename
weights
--
algo
$
{
algo
}
>
$
{
log_path
}
/
st_quant_post_v1_T_
$
{
algo
}
2
>&
1
finally_reward
[
0
],
finally_reward
[
1
],
finally_reward
[
2
]))
print_info
$?
st_quant_post_v1_T_
$
{
algo
}
# 量化后eval
sa_nas
.
reward
(
float
(
finally_reward
[
1
]))
echo
"quant_post eval no bc "
$
{
algo
}
python
eval
.
py
--
model_path
.
/
quant_model
/
$
{
algo
}
/
MobileNet
--
model_name
__model__
\
--
params_name
__params__
>
$
{
log_path
}
/
st_quant_post_
$
{
algo
}
_eval2
2
>&
1
def
test_search_result
(
tokens
,
image_size
,
args
,
config
):
print_info
$?
st_quant_post_
$
{
algo
}
_eval2
places
=
static
.
cuda_places
()
if
args
.
use_gpu
else
static
.
cpu_places
()
place
=
places
[
0
]
# 带bc参数的 离线量化
echo
"quant_post train bc "
$
{
algo
}
sa_nas
=
SANAS
(
python
quant_post
.
py
--
model_path
.
/
inference_model
/
MobileNet
\
config
,
--
save_path
.
/
quant_model
/
$
{
algo
}
_bc
/
MobileNet
\
server_addr
=
(
args
.
server_address
,
args
.
port
),
--
model_filename
model
--
params_filename
weights
\
search_steps
=
args
.
search_steps
,
--
algo
$
{
algo
}
--
bias_correction
True
>
$
{
log_path
}
/
st_quant_post_T_
$
{
algo
}
_bc
2
>&
1
is_server
=
True
)
print_info
$?
st_quant_post_T_
$
{
algo
}
_bc
image_shape
=
[
3
,
image_size
,
image_size
]
# 量化后eval
if
args
.
data
==
'cifar10'
:
echo
"quant_post eval bc "
$
{
algo
}
transform
=
T
.
Compose
([
T
.
Transpose
(),
T
.
Normalize
([
127.5
],
[
127.5
])])
python
eval
.
py
--
model_path
.
/
quant_model
/
$
{
algo
}
_bc
/
MobileNet
--
model_name
__model__
\
train_dataset
=
paddle
.
vision
.
datasets
.
Cifar10
(
--
params_name
__params__
>
$
{
log_path
}
/
st_quant_post_
$
{
algo
}
_bc_eval2
2
>&
1
mode
=
'train'
,
transform
=
transform
,
backend
=
'cv2'
)
print_info
$?
st_quant_post_
$
{
algo
}
_bc_eval2
val_dataset
=
paddle
.
vision
.
datasets
.
Cifar10
(
mode
=
'test'
,
transform
=
transform
,
backend
=
'cv2'
)
done
}
elif
args
.
data
==
'imagenet'
:
train_dataset
=
imagenet_reader
.
ImageNetDataset
(
mode
=
'train'
)
# 2.3 quant_post_hpo # 小数据集
val_dataset
=
imagenet_reader
.
ImageNetDataset
(
mode
=
'val'
)
demo_quant_quant_post_hpo
(){
archs
=
sa_nas
.
tokens2arch
(
tokens
)[
0
]
cd
$
{
slim_dir
}
/
demo
/
quant
/
quant_post_hpo
||
catchException
demo_quant_quant_post_hpo
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
train_program
=
static
.
Program
()
# 1.导出模型
test_program
=
static
.
Program
()
python
..
/
quant_post
/
export_model
.
py
\
startup_program
=
static
.
Program
()
--
model
"MobileNet"
\
train_loader
,
avg_cost
,
acc_top1
,
acc_top5
=
build_program
(
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV1_pretrained
\
train_program
,
startup_program
,
image_shape
,
train_dataset
,
archs
,
args
,
--
data
imagenet
>
$
{
log_path
}
/
st_quant_post__hpo_v1_export
2
>&
1
places
)
print_info
$?
st_quant_post__hpo_v1_export
# 2. quant_post_hpo 设置max_model_quant_count=2
current_flops
=
flops
(
train_program
)
python
quant_post_hpo
.
py
\
print
(
'current_flops: {}'
.
format
(
current_flops
))
--
use_gpu
=
True
\
test_loader
,
test_avg_cost
,
test_acc_top1
,
test_acc_top5
=
build_program
(
--
model_path
=
"./inference_model/MobileNet/"
\
test_program
,
--
save_path
=
"./inference_model/MobileNet_quant/"
\
startup_program
,
--
model_filename
=
"model"
\
image_shape
,
--
params_filename
=
"weights"
\
val_dataset
,
--
max_model_quant_count
=
2
>
$
{
log_path
}
/
st_quant_post_hpo
2
>&
1
archs
,
print_info
$?
st_quant_post_hpo
args
,
# 3. 量化后eval
place
,
python
..
/
quant_post
/
eval
.
py
\
is_test
=
True
)
--
model_path
.
/
inference_model
/
MobileNet_quant
\
--
model_name
__model__
\
test_program
=
test_program
.
clone
(
for_test
=
True
)
--
params_name
__params__
>
$
{
log_path
}
/
st_quant_post_hpo_eval
2
>&
1
print_info
$?
st_quant_post_hpo_eval
exe
=
static
.
Executor
(
place
)
exe
.
run
(
startup_program
)
}
build_strategy
=
static
.
BuildStrategy
()
#2.4
train_compiled_program
=
static
.
CompiledProgram
(
demo_quant_pact_quant_aware
(){
train_program
).
with_data_parallel
(
cd
$
{
slim_dir
}
/
demo
/
quant
/
pact_quant_aware
||
catchException
demo_quant_pact_quant_aware
loss_name
=
avg_cost
.
name
,
build_strategy
=
build_strategy
)
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
for
epoch_id
in
range
(
args
.
retain_epoch
):
# 普通量化,使用小数据集即可
for
batch_id
,
data
in
enumerate
(
train_loader
()):
# 2.1版本时默认BS=128 会报显存不足,故暂时修改成64
fetches
=
[
avg_cost
.
name
]
python
train
.
py
--
model
MobileNetV3_large_x1_0
\
s_time
=
time
.
time
()
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV3_large_x1_0_ssld_pretrained
\
outs
=
exe
.
run
(
train_compiled_program
,
--
num_epochs
1
--
lr
0.0001
--
use_pact
False
--
batch_size
128
>
$
{
log_path
}
/
demo_quant_pact_quant_aware_v3_nopact
2
>&
1
feed
=
data
,
print_info
$?
demo_quant_pact_quant_aware_v3_nopact
fetch_list
=
fetches
)[
0
]
python
train
.
py
--
model
MobileNetV3_large_x1_0
\
batch_time
=
time
.
time
()
-
s_time
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV3_large_x1_0_ssld_pretrained
\
if
batch_id
%
10
==
0
:
--
num_epochs
1
--
lr
0.0001
--
use_pact
True
--
batch_size
64
--
lr_strategy
=
piecewise_decay
\
_logger
.
info
(
--
step_epochs
2
--
l2_decay
1e-5
>
$
{
log_path
}
/
demo_quant_pact_quant_aware_v3
2
>&
1
'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
.
print_info
$?
demo_quant_pact_quant_aware_v3
format
(
epoch_id
,
batch_id
,
outs
[
0
],
batch_time
))
# load
python
train
.
py
--
model
MobileNetV3_large_x1_0
\
reward
=
[]
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV3_large_x1_0_ssld_pretrained
\
for
batch_id
,
data
in
enumerate
(
test_loader
()):
--
num_epochs
2
--
lr
0.0001
--
use_pact
True
--
batch_size
64
--
lr_strategy
=
piecewise_decay
\
test_fetches
=
[
--
step_epochs
20
--
l2_decay
1e-5
\
test_avg_cost
.
name
,
test_acc_top1
.
name
,
test_acc_top5
.
name
--
checkpoint_dir
.
/
output
/
MobileNetV3_large_x1_0
/
0
\
]
--
checkpoint_epoch
0
>
$
{
log_path
}
/
demo_quant_pact_quant_aware_v3_load
2
>&
1
batch_reward
=
exe
.
run
(
test_program
,
print_info
$?
demo_quant_pact_quant_aware_v3_load
feed
=
data
,
}
fetch_list
=
test_fetches
)
reward_avg
=
np
.
mean
(
np
.
array
(
batch_reward
),
axis
=
1
)
# 2.5
reward
.
append
(
reward_avg
)
demo_dygraph_quant
(){
cd
$
{
slim_dir
}
/
demo
/
dygraph
/
quant
||
catchException
demo_dygraph_quant
_logger
.
info
(
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
python
train
.
py
--
model
=
'mobilenet_v1'
\
'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
.
--
pretrained_model
'../../pretrain/MobileNetV1_pretrained'
\
format
(
batch_id
,
batch_reward
[
0
],
batch_reward
[
1
],
batch_reward
[
--
num_epochs
1
\
2
]))
--
batch_size
128
\
>
$
{
log_path
}
/
dy_quant_v1_gpu1
2
>&
1
finally_reward
=
np
.
mean
(
np
.
array
(
reward
),
axis
=
0
)
print_info
$?
dy_quant_v1_gpu1
_logger
.
info
(
# dy_pact_v3
'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'
.
format
(
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
python
train
.
py
--
lr
=
0.001
\
finally_reward
[
0
],
finally_reward
[
1
],
finally_reward
[
2
]))
--
batch_size
128
\
--
use_pact
=
True
--
num_epochs
=
1
--
l2_decay
=
2e-5
--
ls_epsilon
=
0.1
\
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV3_large_x1_0_ssld_pretrained
\
if
__name__
==
'__main__'
:
--
num_epochs
1
>
$
{
log_path
}
/
dy_pact_quant_v3_gpu1
2
>&
1
print_info
$?
dy_pact_quant_v3_gpu1
parser
=
argparse
.
ArgumentParser
(
# 多卡训练,以0到3号卡为例
description
=
'SA NAS MobileNetV2 cifar10 argparase'
)
CUDA_VISIBLE_DEVICES
=
$
{
cudaid2
}
python
-
m
paddle
.
distributed
.
launch
\
parser
.
add_argument
(
train
.
py
--
lr
=
0.001
\
'--use_gpu'
,
--
pretrained_model
..
/
..
/
pretrain
/
MobileNetV3_large_x1_0_ssld_pretrained
\
type
=
ast
.
literal_eval
,
--
use_pact
=
True
--
num_epochs
=
1
\
default
=
True
,
--
l2_decay
=
2e-5
\
help
=
'Whether to use GPU in train/test model.'
)
--
ls_epsilon
=
0.1
\
parser
.
add_argument
(
--
batch_size
=
128
\
'--batch_size'
,
type
=
int
,
default
=
256
,
help
=
'batch size.'
)
--
model_save_dir
output
>
$
{
log_path
}
/
dy_pact_quant_v3_gpu4
2
>&
1
parser
.
add_argument
(
print_info
$?
dy_pact_quant_v3_gpu4
'--class_dim'
,
type
=
int
,
default
=
10
,
help
=
'classify number.'
)
}
parser
.
add_argument
(
# 2.6
'--data'
,
ce_tests_dygraph_qat
(){
type
=
str
,
cd
$
{
slim_dir
}
/
ce_tests
/
dygraph
/
quant
||
catchException
ce_tests_dygraph_qat
default
=
'cifar10'
,
ln
-
s
$
{
slim_dir
}
/
demo
/
data
/
ILSVRC2012
choices
=
[
'cifar10'
,
'imagenet'
],
test_samples
=
1000
# if set as -1, use all test samples
help
=
'server address.'
)
data_path
=
'./ILSVRC2012/'
parser
.
add_argument
(
batch_size
=
16
'--is_server'
,
epoch
=
1
type
=
ast
.
literal_eval
,
lr
=
0.0001
default
=
True
,
num_workers
=
1
help
=
'Whether to start a server.'
)
output_dir
=
$
PWD
/
output_models
parser
.
add_argument
(
for
model
in
mobilenet_v1
'--search_steps'
,
do
type
=
int
,
# if [ $1 == nopact ];then
default
=
100
,
# 1 quant train
help
=
'controller server number.'
)
echo
"------1 nopact train--------"
,
$
{
model
}
parser
.
add_argument
(
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
'--server_address'
,
type
=
str
,
default
=
""
,
help
=
'server ip.'
)
python
.
/
src
/
qat
.
py
\
parser
.
add_argument
(
'--port'
,
type
=
int
,
default
=
8881
,
help
=
'server port'
)
--
arch
=
$
{
model
}
\
parser
.
add_argument
(
--
data
=
$
{
data_path
}
\
'--retain_epoch'
,
type
=
int
,
default
=
5
,
help
=
'epoch for each token.'
)
--
epoch
=
$
{
epoch
}
\
parser
.
add_argument
(
'--lr'
,
type
=
float
,
default
=
0.1
,
help
=
'learning rate.'
)
--
batch_size
=
32
\
args
=
parser
.
parse_args
()
--
num_workers
=
$
{
num_workers
}
\
print
(
args
)
--
lr
=
$
{
lr
}
\
--
output_dir
=
$
{
output_dir
}
\
if
args
.
data
==
'cifar10'
:
--
enable_quant
>
qat_
$
{
model
}
_gpu1_nw1
2
>&
1
image_size
=
32
# 2 eval before save quant
block_num
=
3
echo
"--------2 eval before save quant -------------"
,
$
{
model
}
elif
args
.
data
==
'imagenet'
:
python
.
/
src
/
eval
.
py
\
image_size
=
224
--
model_path
=
.
/
output_models
/
quant_dygraph
/
$
{
model
}
\
block_num
=
6
--
data_dir
=
$
{
data_path
}
\
else
:
--
test_samples
=
$
{
test_samples
}
\
raise
NotImplementedError
(
--
batch_size
=
$
{
batch_size
}
>
eval_before_save_
$
{
model
}
2
>&
1
'data must in [cifar10, imagenet], but received: {}'
.
format
(
# 3 CPU上部署量化模型,需要使用`test/save_quant_model.py`脚本进行模型转换。
args
.
data
))
echo
"--------3 save_nopact_quant_model-------------"
,
$
{
model
}
python
src
/
save_quant_model
.
py
\
config
=
[(
'MobileNetV2Space'
)]
--
load_model_path
output_models
/
quant_dygraph
/
$
{
model
}
\
paddle
.
enable_static
()
--
save_model_path
int8_models
/
$
{
model
}
>
save_quant_
$
{
model
}
2
>&
1
search_mobilenetv2
(
config
,
args
,
image_size
,
is_server
=
args
.
is_server
)
# 4
echo
"--------4 CPU eval after save nopact quant -------------"
,
$
{
model
}
export
CUDA_VISIBLE_DEVICES
=
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
int8_models
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
cpu_eval_after_save_
$
{
model
}
2
>&
1
# elif [ $1 == pact ];then
# 1 pact quant train
echo
"------1 pact train--------"
,
$
{
model
}
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
python
.
/
src
/
qat
.
py
\
--
arch
=
$
{
model
}
\
--
data
=
$
{
data_path
}
\
--
epoch
=
$
{
epoch
}
\
--
batch_size
=
32
\
--
num_workers
=
$
{
num_workers
}
\
--
lr
=
$
{
lr
}
\
--
output_dir
=
$
PWD
/
output_models_pact
/
\
--
enable_quant
\
--
use_pact
>
pact_qat_
$
{
model
}
_gpu1_nw1
2
>&
1
# 2 eval before save quant
echo
"--------2 eval before save pact quant -------------"
,
$
{
model
}
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
output_models_pact
/
quant_dygraph
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
eval_before_pact_save_
$
{
model
}
2
>&
1
echo
"--------3 save pact quant -------------"
,
$
{
model
}
python
src
/
save_quant_model
.
py
\
--
load_model_path
output_models_pact
/
quant_dygraph
/
$
{
model
}
\
--
save_model_path
int8_models_pact
/
$
{
model
}
>
save_pact_quant_
$
{
model
}
2
>&
1
echo
"--------4 CPU eval after save pact quant -------------"
,
$
{
model
}
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
int8_models_pact
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
cpu_eval_after_pact_save_
$
{
model
}
2
>&
1
# fi
done
}
ce_tests_dygraph_qat
(){
cd
$
{
slim_dir
}
/
ce_tests
/
dygraph
/
quant
||
catchException
ce_tests_dygraph_qat4
ln
-
s
$
{
slim_dir
}
/
demo
/
data
/
ILSVRC2012
test_samples
=
1000
# if set as -1, use all test samples
data_path
=
'./ILSVRC2012/'
batch_size
=
16
epoch
=
1
lr
=
0.0001
num_workers
=
1
output_dir
=
$
PWD
/
output_models
for
model
in
mobilenet_v1
#for model in mobilenet_v1 mobilenet_v2 resnet50 vgg16
do
# if [ $1 == nopact ];then
# 1 quant train
echo
"------1 nopact train--------"
,
$
{
model
}
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
python
.
/
src
/
qat
.
py
\
--
arch
=
$
{
model
}
\
--
data
=
$
{
data_path
}
\
--
epoch
=
$
{
epoch
}
\
--
batch_size
=
32
\
--
num_workers
=
$
{
num_workers
}
\
--
lr
=
$
{
lr
}
\
--
output_dir
=
$
{
output_dir
}
\
--
enable_quant
>
qat_
$
{
model
}
_gpu1_nw1
2
>&
1
# 2 eval before save quant
echo
"--------2 eval before save quant -------------"
,
$
{
model
}
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
output_models
/
quant_dygraph
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
eval_before_save_
$
{
model
}
2
>&
1
# 3 CPU上部署量化模型,需要使用`test/save_quant_model.py`脚本进行模型转换。
echo
"--------3 save_nopact_quant_model-------------"
,
$
{
model
}
python
src
/
save_quant_model
.
py
\
--
load_model_path
output_models
/
quant_dygraph
/
$
{
model
}
\
--
save_model_path
int8_models
/
$
{
model
}
>
save_quant_
$
{
model
}
2
>&
1
# 4
echo
"--------4 CPU eval after save nopact quant -------------"
,
$
{
model
}
export
CUDA_VISIBLE_DEVICES
=
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
int8_models
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
cpu_eval_after_save_
$
{
model
}
2
>&
1
# elif [ $1 == pact ];then
# 1 pact quant train
echo
"------1 pact train--------"
,
$
{
model
}
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
python
.
/
src
/
qat
.
py
\
--
arch
=
$
{
model
}
\
--
data
=
$
{
data_path
}
\
--
epoch
=
$
{
epoch
}
\
--
batch_size
=
32
\
--
num_workers
=
$
{
num_workers
}
\
--
lr
=
$
{
lr
}
\
--
output_dir
=
$
PWD
/
output_models_pact
/
\
--
enable_quant
\
--
use_pact
>
pact_qat_
$
{
model
}
_gpu1_nw1
2
>&
1
# 2 eval before save quant
echo
"--------2 eval before save pact quant -------------"
,
$
{
model
}
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
output_models_pact
/
quant_dygraph
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
eval_before_pact_save_
$
{
model
}
2
>&
1
echo
"--------3 save pact quant -------------"
,
$
{
model
}
python
src
/
save_quant_model
.
py
\
--
load_model_path
output_models_pact
/
quant_dygraph
/
$
{
model
}
\
--
save_model_path
int8_models_pact
/
$
{
model
}
>
save_pact_quant_
$
{
model
}
2
>&
1
echo
"--------4 CPU eval after save pact quant -------------"
,
$
{
model
}
python
.
/
src
/
eval
.
py
\
--
model_path
=
.
/
int8_models_pact
/
$
{
model
}
\
--
data_dir
=
$
{
data_path
}
\
--
test_samples
=
$
{
test_samples
}
\
--
batch_size
=
$
{
batch_size
}
>
cpu_eval_after_pact_save_
$
{
model
}
2
>&
1
# fi
done
}
ce_tests_dygraph_ptq
(){
cd
$
{
slim_dir
}
/
ce_tests
/
dygraph
/
quant
||
catchException
ce_tests_dygraph_ptq4
ln
-
s
$
{
slim_dir
}
/
demo
/
data
/
ILSVRC2012
test_samples
=
1000
# if set as -1, use all test samples
data_path
=
'./ILSVRC2012/'
batch_size
=
32
epoch
=
1
output_dir
=
"./output_ptq"
quant_batch_num
=
10
quant_batch_size
=
10
for
model
in
mobilenet_v1
#for model in mobilenet_v1 mobilenet_v2 resnet50 vgg16
do
echo
"--------quantize model: ${model}-------------"
export
CUDA_VISIBLE_DEVICES
=
$
{
cudaid1
}
# save ptq quant model
python
.
/
src
/
ptq
.
py
\
--
data
=
$
{
data_path
}
\
--
arch
=
$
{
model
}
\
--
quant_batch_num
=
$
{
quant_batch_num
}
\
--
quant_batch_size
=
$
{
quant_batch_size
}
\
--
output_dir
=
$
{
output_dir
}
>
$
{
log_path
}
/
ptq_
$
{
model
}
2
>&
1
print_info
$?
ptq_
$
{
model
}
echo
"-------- eval fp32_infer model -------------"
,
$
{
model
}
python
.
/
src
/
test
.
py
\
--
model_path
=
$
{
output_dir
}
/
$
{
model
}
/
fp32_infer
\
--
data_dir
=
$
{
data_path
}
\
--
batch_size
=
$
{
batch_size
}
\
--
use_gpu
=
True
\
--
test_samples
=
$
{
test_samples
}
\
--
ir_optim
=
False
>
$
{
log_path
}
/
ptq_eval_fp32_
$
{
model
}
2
>&
1
print_info
$?
ptq_eval_fp32_
$
{
model
}
echo
"-------- eval int8_infer model -------------"
,
$
{
model
}
python
.
/
src
/
test
.
py
\
--
model_path
=
$
{
output_dir
}
/
$
{
model
}
/
int8_infer
\
--
data_dir
=
$
{
data_path
}
\
--
batch_size
=
$
{
batch_size
}
\
--
use_gpu
=
False
\
--
test_samples
=
$
{
test_samples
}
\
--
ir_optim
=
False
>
$
{
log_path
}
/
ptq_eval_int8_
$
{
model
}
2
>&
1
print_info
$?
ptq_eval_int8_
$
{
model
}
done
}
#用于更新release分支下无ce_tests_dygraph_ptq case;release分支设置is_develop="False"
is_develop
=
"True"
all_quant
(){
# 10个模型
if
[
"${is_develop}"
==
"True"
];
then
#ce_tests_dygraph_ptq4
ce_tests_dygraph_ptq
fi
demo_quant_quant_aware
# 2个模型
demo_quant_quant_embedding
# 1个模型
demo_quant_quant_post
# 4个策略
demo_dygraph_quant
# 2个模型
demo_quant_pact_quant_aware
# 1个模型
ce_tests_dygraph_qat
# 4个模型
#ce_tests_dygraph_qat4
demo_quant_quant_post_hpo
}
# 3 prune
# Static-graph structured pruning demos (demo/prune).
# Every enabled case writes its output to ${log_path}/<case> and then hands
# the exit status plus the case name to print_info, so print_info must be
# called immediately after the command it reports on.
demo_prune() {
    cd ${slim_dir}/demo/prune || catchException demo_prune

    # 3.1 P0 prune
    # Start from a clean "models" directory.
    if [ -d "models" ]; then
        rm -rf models
    fi
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    python train.py \
        --model "MobileNet" \
        --pruned_ratio 0.31 \
        --data "imagenet" \
        --pretrained_model ../pretrain/MobileNetV1_pretrained/ \
        --num_epochs 1 > ${log_path}/prune_v1_T 2>&1
    print_info $? prune_v1_T

    # 3.2 prune_fpgm
    # slim_prune_fpgm_v1_T
    # export CUDA_VISIBLE_DEVICES=${cudaid1}
    # python train.py \
    #     --model="MobileNet" \
    #     --pretrained_model="../pretrain/MobileNetV1_pretrained" \
    #     --data="imagenet" \
    #     --pruned_ratio=0.3125 \
    #     --lr=0.1 \
    #     --num_epochs=1 \
    #     --test_period=1 \
    #     --step_epochs 30 60 90\
    #     --l2_decay=3e-5 \
    #     --lr_strategy="piecewise_decay" \
    #     --criterion="geometry_median" \
    #     --model_path="./fpgm_mobilenetv1_models" \
    #     --save_inference True >${log_path}/slim_prune_fpgm_v1_T 2>&1
    # print_info $? slim_prune_fpgm_v1_T

    # slim_prune_fpgm_v2_T
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    # v2 -50%
    python train.py \
        --model="MobileNetV2" \
        --pretrained_model="../pretrain/MobileNetV2_pretrained" \
        --data="imagenet" \
        --pruned_ratio=0.325 \
        --lr=0.001 \
        --num_epochs=2 \
        --test_period=1 \
        --step_epochs 30 60 80 \
        --l2_decay=1e-4 \
        --lr_strategy="piecewise_decay" \
        --criterion="geometry_median" \
        --model_path="./output/fpgm_mobilenetv2_models" \
        --save_inference True > ${log_path}/slim_prune_fpgm_v2_T 2>&1
    print_info $? slim_prune_fpgm_v2_T

    python eval.py \
        --model "MobileNetV2" \
        --data "imagenet" \
        --model_path "./output/fpgm_mobilenetv2_models/0" > ${log_path}/slim_prune_fpgm_v2_eval 2>&1
    print_info $? slim_prune_fpgm_v2_eval

    # ResNet34 -50
    # export CUDA_VISIBLE_DEVICES=${cudaid1}
    # python train.py \
    #     --model="ResNet34" \
    #     --pretrained_model="../pretrain/ResNet34_pretrained" \
    #     --data="imagenet" \
    #     --pruned_ratio=0.3125 \
    #     --lr=0.001 \
    #     --num_epochs=2 \
    #     --test_period=1 \
    #     --step_epochs 30 60 \
    #     --l2_decay=1e-4 \
    #     --lr_strategy="piecewise_decay" \
    #     --criterion="geometry_median" \
    #     --model_path="./output/fpgm_resnet34_50_models" \
    #     --save_inference True >${log_path}/slim_prune_fpgm_resnet34_50_T 2>&1
    # The training command above is disabled, so its report call is disabled
    # too: it would read a stale $? and try to move a log file that is never
    # written.
    # print_info $? slim_prune_fpgm_resnet34_50_T

    # NOTE(review): this eval reads ./output/fpgm_resnet34_50_models/0, which
    # the disabled training step above would have produced - confirm the
    # checkpoint is provided some other way, or disable this case as well.
    python eval.py \
        --model "ResNet34" \
        --data "imagenet" \
        --model_path "./output/fpgm_resnet34_50_models/0" > ${log_path}/slim_prune_fpgm_resnet34_50_eval 2>&1
    print_info $? slim_prune_fpgm_resnet34_50_eval

    # ResNet34 -42 slim_prune_fpgm_resnet34_42_T
    cd ${slim_dir}/demo/prune
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    python train.py \
        --model="ResNet34" \
        --pretrained_model="../pretrain/ResNet34_pretrained" \
        --data="imagenet" \
        --pruned_ratio=0.25 \
        --num_epochs=2 \
        --test_period=1 \
        --lr_strategy="cosine_decay" \
        --criterion="geometry_median" \
        --model_path="./output/fpgm_resnet34_025_120_models" \
        --save_inference True > ${log_path}/slim_prune_fpgm_resnet34_42_T 2>&1
    print_info $? slim_prune_fpgm_resnet34_42_T

    python eval.py \
        --model "ResNet34" \
        --data "imagenet" \
        --model_path "./output/fpgm_resnet34_025_120_models/0" > ${log_path}/slim_prune_fpgm_resnet34_42_eval 2>&1
    print_info $? slim_prune_fpgm_resnet34_42_eval

    # 3.3 prune ResNet50
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    # In version 2.1 the default batch size of 256 runs out of GPU memory,
    # so it is temporarily set to 128.
    python train.py \
        --model ResNet50 \
        --pruned_ratio 0.31 \
        --data "imagenet" \
        --save_inference True \
        --pretrained_model ../pretrain/ResNet50_pretrained \
        --num_epochs 1 \
        --batch_size 128 > ${log_path}/prune_ResNet50_T 2>&1
    print_info $? prune_ResNet50_T
}
# 3.4 dygraph_prune
# dy_prune_ResNet34_f42
# Dynamic-graph structured pruning demos (demo/dygraph/pruning): train,
# resume, evaluate and export on one card, then two launcher-based runs that
# use ${cudaid2}. Each case logs to ${log_path}/<case> and reports through
# print_info.
demo_dygraph_pruning() {
    cd ${slim_dir}/demo/dygraph/pruning || catchException demo_dygraph_pruning
    ln -s ${slim_dir}/demo/data data

    CUDA_VISIBLE_DEVICES=${cudaid1} python train.py \
        --use_gpu=True \
        --model="resnet34" \
        --data="imagenet" \
        --pruned_ratio=0.25 \
        --num_epochs=1 \
        --batch_size=128 \
        --lr_strategy="cosine_decay" \
        --criterion="fpgm" \
        --model_path="./fpgm_resnet34_025_120_models" > ${log_path}/dy_prune_ResNet34_f42_gpu1 2>&1
    print_info $? dy_prune_ResNet34_f42_gpu1

    # 2.3 Resume training: continue from a checkpoint via the --checkpoint option.
    CUDA_VISIBLE_DEVICES=${cudaid1} python train.py \
        --use_gpu=True \
        --model="resnet34" \
        --data="imagenet" \
        --pruned_ratio=0.25 \
        --num_epochs=2 \
        --batch_size=128 \
        --lr_strategy="cosine_decay" \
        --criterion="fpgm" \
        --model_path="./fpgm_resnet34_025_120_models" \
        --checkpoint="./fpgm_resnet34_025_120_models/0" > ${log_path}/dy_prune_ResNet34_f42_gpu1_load 2>&1
    print_info $? dy_prune_ResNet34_f42_gpu1_load

    # 2.4 Evaluation: run eval.py to measure the accuracy of the pruned and
    # retrained model on the test data.
    CUDA_VISIBLE_DEVICES=${cudaid1} python eval.py \
        --checkpoint=./fpgm_resnet34_025_120_models/1 \
        --model="resnet34" \
        --pruned_ratio=0.25 \
        --batch_size=128 > ${log_path}/dy_prune_ResNet34_f42_gpu1_eval 2>&1
    print_info $? dy_prune_ResNet34_f42_gpu1_eval

    # 2.5 Export: produce the model used for inference.
    CUDA_VISIBLE_DEVICES=${cudaid1} python export_model.py \
        --checkpoint=./fpgm_resnet34_025_120_models/final \
        --model="resnet34" \
        --pruned_ratio=0.25 \
        --output_path=./infer_final/resnet > ${log_path}/dy_prune_ResNet34_f42_gpu1_export 2>&1
    print_info $? dy_prune_ResNet34_f42_gpu1_export

    # add dy_prune_fpgm_mobilenetv1_50_T
    CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
        --log_dir="fpgm_mobilenetv1_train_log" \
        train.py \
        --model="mobilenet_v1" \
        --data="imagenet" \
        --pruned_ratio=0.3125 \
        --lr=0.1 \
        --num_epochs=1 \
        --test_period=1 \
        --step_epochs 30 60 90 \
        --l2_decay=3e-5 \
        --lr_strategy="piecewise_decay" \
        --criterion="fpgm" \
        --model_path="./fpgm_mobilenetv1_models" > ${log_path}/dy_prune_fpgm_mobilenetv1_50_T 2>&1
    print_info $? dy_prune_fpgm_mobilenetv1_50_T

    # add dy_prune_fpgm_mobilenetv2_50_T
    # CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
    #     --log_dir="fpgm_mobilenetv2_train_log" \
    #     train.py \
    #     --model="mobilenet_v2" \
    #     --data="imagenet" \
    #     --pruned_ratio=0.325 \
    #     --lr=0.001 \
    #     --num_epochs=1 \
    #     --test_period=1 \
    #     --step_epochs 30 60 80\
    #     --l2_decay=1e-4 \
    #     --lr_strategy="piecewise_decay" \
    #     --criterion="fpgm" \
    #     --model_path="./fpgm_mobilenetv2_models" > ${log_path}/dy_prune_fpgm_mobilenetv2_50_T 2>&1
    # print_info $? dy_prune_fpgm_mobilenetv2_50_T

    # add
    CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
        --log_dir="fpgm_resnet34_f_42_train_log" \
        train.py \
        --use_gpu=True \
        --model="resnet34" \
        --data="imagenet" \
        --pruned_ratio=0.25 \
        --batch_size=128 \
        --num_epochs=1 \
        --test_period=1 \
        --lr_strategy="cosine_decay" \
        --criterion="fpgm" \
        --model_path="./fpgm_resnet34_025_120_models" > ${log_path}/dy_prune_ResNet34_f42_gpu2 2>&1
    print_info $? dy_prune_ResNet34_f42_gpu2
}
# 3.5 st unstructured_prune
# Static-graph unstructured pruning demos (demo/unstructured_prune):
# threshold mode, ratio mode, an MNIST evaluation and a GMP run started
# through paddle.distributed.launch. Each case logs to ${log_path}/<case>
# and reports through print_info.
demo_unstructured_prune() {
    cd ${slim_dir}/demo/unstructured_prune || catchException demo_unstructured_prune

    # Note: batch_size in the commands here is the total batch size across
    # all cards, i.e. a single card runs with a batch size of 256.
    ## sparsity: -30%, accuracy: 70%/89%
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    python train.py \
        --batch_size 256 \
        --pretrained_model ../pretrain/MobileNetV1_pretrained \
        --lr 0.05 \
        --pruning_mode threshold \
        --threshold 0.01 \
        --data imagenet \
        --lr_strategy piecewise_decay \
        --step_epochs 1 2 3 \
        --num_epochs 1 \
        --test_period 1 \
        --model_period 1 \
        --model_path st_unstructured_models > ${log_path}/st_unstructured_prune_threshold_T 2>&1
    print_info $? st_unstructured_prune_threshold_T

    # eval
    python evaluate.py \
        --pruned_model=st_unstructured_models \
        --data="imagenet" > ${log_path}/st_unstructured_prune_threshold_eval 2>&1
    print_info $? st_unstructured_prune_threshold_eval

    ## sparsity: -55%, accuracy: 67%+/87%+
    export CUDA_VISIBLE_DEVICES=${cudaid1}
    python train.py \
        --batch_size 256 \
        --pretrained_model ../pretrain/MobileNetV1_pretrained \
        --lr 0.05 \
        --pruning_mode ratio \
        --ratio 0.55 \
        --data imagenet \
        --lr_strategy piecewise_decay \
        --step_epochs 1 2 3 \
        --num_epochs 1 \
        --test_period 1 \
        --model_period 1 \
        --model_path st_ratio_models > ${log_path}/st_unstructured_prune_ratio_T 2>&1
    print_info $? st_unstructured_prune_ratio_T

    # MNIST dataset
    # python train.py \
    #     --batch_size 256 \
    #     --pretrained_model ../pretrain/MobileNetV1_pretrained \
    #     --lr 0.05 \
    #     --pruning_mode threshold \
    #     --threshold 0.01 \
    #     --data mnist \
    #     --lr_strategy piecewise_decay \
    #     --step_epochs 1 2 3 \
    #     --num_epochs 1 \
    #     --test_period 1 \
    #     --model_period 1 \
    #     --model_path st_unstructured_models_mnist >${log_path}/st_unstructured_prune_threshold_mnist_T 2>&1
    # print_info $? st_unstructured_prune_threshold_mnist_T

    # eval
    # NOTE(review): the MNIST training step above is commented out while this
    # evaluation of st_unstructured_models_mnist still runs - confirm that
    # model directory is available from elsewhere.
    python evaluate.py \
        --pruned_model=st_unstructured_models_mnist \
        --data="mnist" > ${log_path}/st_unstructured_prune_threshold_mnist_eval 2>&1
    print_info $? st_unstructured_prune_threshold_mnist_eval

    export CUDA_VISIBLE_DEVICES=${cudaid2}
    python -m paddle.distributed.launch \
        --log_dir="st_unstructured_prune_gmp_log" \
        train.py \
        --batch_size 64 \
        --data imagenet \
        --pruning_mode ratio \
        --ratio 0.75 \
        --lr 0.005 \
        --model MobileNet \
        --num_epochs 1 \
        --test_period 5 \
        --model_period 10 \
        --pretrained_model ../pretrain/MobileNetV1_pretrained \
        --model_path "./models" \
        --step_epochs 71 88 \
        --initial_ratio 0.15 \
        --pruning_steps 5 \
        --stable_epochs 0 \
        --pruning_epochs 54 \
        --tunning_epochs 54 \
        --last_epoch -1 \
        --prune_params_type conv1x1_only \
        --pruning_strategy gmp > ${log_path}/st_unstructured_prune_ratio_gmp 2>&1
    print_info $? st_unstructured_prune_ratio_gmp
}
# Dynamic-graph unstructured pruning demos (demo/dygraph/unstructured_pruning):
# ratio mode, threshold mode, evaluation, resuming from saved parameters and
# a GMP run. All training runs are started through paddle.distributed.launch
# with CUDA_VISIBLE_DEVICES=${cudaid2}. Each case logs to ${log_path}/<case>
# and reports through print_info.
demo_dygraph_unstructured_pruning() {
    # dy_threshold
    cd ${slim_dir}/demo/dygraph/unstructured_pruning || catchException demo_dygraph_unstructured_pruning
    export CUDA_VISIBLE_DEVICES=${cudaid2}

    ## sparsity: -55%, accuracy: 67%+/87%+
    python -m paddle.distributed.launch \
        --log_dir train_dy_ratio_log train.py \
        --data imagenet \
        --lr 0.05 \
        --pruning_mode ratio \
        --ratio 0.55 \
        --batch_size 256 \
        --lr_strategy piecewise_decay \
        --step_epochs 1 2 3 \
        --num_epochs 1 \
        --test_period 1 \
        --model_period 1 \
        --model_path dy_ratio_models > ${log_path}/dy_prune_ratio_T 2>&1
    print_info $? dy_prune_ratio_T

    ## sparsity: -30%, accuracy: 70%/89%
    export CUDA_VISIBLE_DEVICES=${cudaid2}
    python -m paddle.distributed.launch \
        --log_dir train_dy_threshold_log train.py \
        --data imagenet \
        --lr 0.05 \
        --pruning_mode threshold \
        --threshold 0.01 \
        --batch_size 256 \
        --lr_strategy piecewise_decay \
        --step_epochs 1 2 3 \
        --num_epochs 1 \
        --test_period 1 \
        --model_period 1 \
        --model_path dy_threshold_models > ${log_path}/dy_threshold_prune_T 2>&1
    print_info $? dy_threshold_prune_T

    # eval
    python evaluate.py \
        --pruned_model dy_threshold_models/model.pdparams \
        --data imagenet > ${log_path}/dy_threshold_prune_eval 2>&1
    print_info $? dy_threshold_prune_eval

    # load: resume from the parameters saved by the threshold run above
    python -m paddle.distributed.launch \
        --log_dir train_dy_threshold_load_log train.py \
        --data imagenet \
        --lr 0.05 \
        --pruning_mode threshold \
        --threshold 0.01 \
        --batch_size 256 \
        --lr_strategy piecewise_decay \
        --step_epochs 1 2 3 \
        --num_epochs 3 \
        --test_period 1 \
        --model_period 1 \
        --model_path dy_threshold_models_new \
        --pretrained_model dy_threshold_models/model.pdparams \
        --last_epoch 1 > ${log_path}/dy_threshold_prune_T_load 2>&1
    print_info $? dy_threshold_prune_T_load

    # cifar10
    # python train.py --data cifar10 --lr 0.05 \
    #     --pruning_mode threshold \
    #     --threshold 0.01 \
    #     --model_period 1 \
    #     --num_epochs 2 >${log_path}/dy_threshold_prune_cifar10_T 2>&1
    # print_info $? dy_threshold_prune_cifar10_T

    export CUDA_VISIBLE_DEVICES=${cudaid2}
    # The output must be redirected with ">": without it the log path is
    # passed to train.py as an extra positional argument, no log file is
    # written, and print_info has nothing to rename.
    python -m paddle.distributed.launch \
        --log_dir="dy_unstructured_prune_gmp_log" \
        train.py \
        --batch_size 64 \
        --data imagenet \
        --pruning_mode ratio \
        --ratio 0.75 \
        --lr 0.005 \
        --num_epochs 1 \
        --test_period 5 \
        --model_period 10 \
        --model_path "./models" \
        --step_epochs 71 88 \
        --initial_ratio 0.15 \
        --pruning_steps 100 \
        --stable_epochs 0 \
        --pruning_epochs 54 \
        --tunning_epochs 54 \
        --last_epoch -1 \
        --pruning_strategy gmp \
        --skip_params_type exclude_conv1x1 > ${log_path}/dy_unstructured_prune_ratio_gmp 2>&1
    print_info $? dy_unstructured_prune_ratio_gmp
}
##################
# Run every pruning suite in order: structured (static, then dygraph)
# followed by unstructured (static, then dygraph).
all_prune() {
    local suite
    local -a suites=(
        # 7 models
        demo_prune
        demo_dygraph_pruning
        demo_unstructured_prune
        # 4 models
        demo_dygraph_unstructured_pruning
    )

    for suite in "${suites[@]}"; do
        "${suite}"
    done
}
#4 nas
# Single-step SA-NAS smoke run on MobileNetV2 (demo/nas).
# Sets the global "model" to the case name, which is used both as the log
# file name under ${log_path} and as the label passed to print_info.
demo_nas() {
    # 4.1 sa_nas_mobilenetv2
    cd ${slim_dir}/demo/nas || catchException demo_nas

    model=demo_nas_sa_nas_v2_T_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python sa_nas_mobilenetv2.py \
        --search_steps 1 \
        --port 8881 > ${log_path}/${model} 2>&1
    print_info $? ${model}
}
# Extended NAS runs (demo/nas): SA-NAS, block SA-NAS and RL-NAS on
# MobileNetV2, one search step each, each on its own port.
# The global "model" holds the current case name; it names the log file
# under ${log_path} and is the label passed to print_info.
demo_nas4() {
    cd ${slim_dir}/demo/nas || catchException demo_nas4

    model=sa_nas_v2_T_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python sa_nas_mobilenetv2.py \
        --search_steps 1 \
        --retain_epoch 1 \
        --port 8881 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    # 4.2 block_sa_nas_mobilenetv2
    model=block_sa_nas_v2_T_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python block_sa_nas_mobilenetv2.py \
        --search_steps 1 \
        --port 8883 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    # 4.3 rl_nas
    model=rl_nas_v2_T_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python rl_nas_mobilenetv2.py \
        --search_steps 1 \
        --port 8885 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    # 4.4 parl_nas
    #model=parl_nas_v2_T_1card
    #CUDA_VISIBLE_DEVICES=${cudaid1} python parl_nas_mobilenetv2.py \
    #--search_steps 1 --port 8887 >${log_path}/${model} 2>&1
    #print_info $? ${model}
}
# Entry point for the NAS suite; currently it only runs demo_nas.
# (The original note here reads "3 models".)
all_nas() {
    demo_nas
}
# 5 darts
# search 1card  # DARTS first-order approximation search method
# Runs one search epoch and one PC_DARTS training epoch (demo/darts).
# The global "model" holds the current case name; it names the log file
# under ${log_path} and is the label passed to print_info.
demo_darts() {
    cd ${slim_dir}/demo/darts || catchException demo_darts

    model=darts1_search_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python search.py \
        --epochs 1 \
        --use_multiprocess False \
        --batch_size 32 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    # train
    model=pcdarts_train_1card
    CUDA_VISIBLE_DEVICES=${cudaid1} python train.py \
        --arch='PC_DARTS' \
        --epochs 1 \
        --use_multiprocess False \
        --batch_size 32 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    # Visualization
    #pip install graphviz
    #model=slim_darts_visualize_pcdarts
    #python visualize.py PC_DARTS > ${log_path}/${model} 2>&1
    #print_info $? ${model}
}
# SlimFaceNet demo (demo/slimfacenet): train, quantize and test
# SlimFaceNet_B_x0_75 through train_eval.py.
# The CASIA and lfw datasets are symlinked in from ${data_path}. The global
# "model" holds the current case name; it names the log file under
# ${log_path} and is the label passed to print_info.
slimfacenet() {
    cd ${slim_dir}/demo/slimfacenet || catchException slimfacenet
    ln -s ${data_path}/slim/slimfacenet/CASIA CASIA
    ln -s ${data_path}/slim/slimfacenet/lfw lfw

    model=slim_slimfacenet_B75_train
    CUDA_VISIBLE_DEVICES=${cudaid1} python -u train_eval.py \
        --train_data_dir=./CASIA/ \
        --test_data_dir=./lfw/ \
        --action train \
        --model=SlimFaceNet_B_x0_75 \
        --start_epoch 0 \
        --total_epoch 1 > ${log_path}/${model} 2>&1
    print_info $? ${model}

    model=slim_slimfacenet_B75_quan
    CUDA_VISIBLE_DEVICES=${cudaid1} python train_eval.py \
        --action quant \
        --train_data_dir=./CASIA/ \
        --test_data_dir=./lfw/ > ${log_path}/${model} 2>&1
    print_info $? ${model}

    model=slim_slimfacenet_B75_eval
    CUDA_VISIBLE_DEVICES=${cudaid1} python train_eval.py \
        --action test \
        --train_data_dir=./CASIA/ \
        --test_data_dir=./lfw/ > ${log_path}/${model} 2>&1
    print_info $? ${model}
}
# Entry point for the DARTS suite.
all_darts() {
    # 2 models
    demo_darts
    # slimfacenet: needs to be removed
}
# Latency predictor demo (demo/analysis): runs latency_predictor.py for
# mobilenet_v1 and mobilenet_v2, each with fp32 and then int8.
# For every combination the global "model" is set to
# latency_<network>_<data type>; it names the log file under ${log_path}
# and is the label passed to print_info.
demo_latency() {
    local net dtype

    cd ${slim_dir}/demo/analysis || catchException demo_latency

    for net in mobilenet_v1 mobilenet_v2; do
        for dtype in fp32 int8; do
            model=latency_${net}_${dtype}
            python latency_predictor.py \
                --model ${net} \
                --data_type ${dtype} > ${log_path}/${model} 2>&1
            print_info $? ${model}
        done
    done
}
# Entry point for the latency suite; it only runs demo_latency.
all_latency() {
    demo_latency
}
####################################
# Driver: print a summary, run every suite in all_case_list, then exit
# non-zero if any *FAIL* log exists under ${slim_dir}/logs.

# Suites executed by this run, in order.
export all_case_list=(all_distillation all_quant all_prune all_nas)

export all_case_time=0
# Estimated run time per suite, in minutes, keyed by suite name.
# The table has to be declared as an associative array under the name that
# is actually assigned and read (all_case_dic). As a plain indexed array,
# every string key evaluates to index 0, so the entries overwrite each other.
declare -A all_case_dic
all_case_dic=(
    ["all_distillation"]=5
    ["all_quant"]=15
    ["all_prune"]=1
    ["all_nas"]=30
    ["all_darts"]=30
    ['unstructured_prune']=15
    ['dy_qat1']=1
)
for key in "${!all_case_dic[@]}"; do
    all_case_time=$(( all_case_time + ${all_case_dic[$key]} ))
done

# errexit is enabled only around the summary output and switched off again
# before the suites are started.
set -e
echo -e "\033[35m ---- P0case_list length: ${#all_case_list[*]}, cases: ${all_case_list[*]} \033[0m"
echo -e "\033[35m ---- P0case_time: $all_case_time min \033[0m"
set +e

####################################
echo -e "\033[35m ---- start run case  \033[0m"
case_num=1
for model in ${all_case_list[*]}; do
    # The task time comes from the per-suite table, not from the suite list.
    echo -e "\033[35m ---- running P0case $case_num/${#all_case_list[*]}: ${model} , task time: ${all_case_dic[${model}]} min \033[0m"
    # Each entry is the name of a suite function; invoke it.
    ${model}
    case_num=$((case_num + 1))
done
echo -e "\033[35m ---- end run case  \033[0m"

# The run fails if any log file was renamed to FAIL_*.
cd ${slim_dir}/logs
FF=$(ls *FAIL* | wc -l)
if [ "${FF}" -gt "0" ]; then
    exit 1
else
    exit 0
fi
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录