Unverified commit 604c0bbe, authored by ceci3, committed by GitHub

optimization yaml (#1157)

* optimization yaml

* update yaml

* update yaml

* fix unittest

* fix unittest
Parent 769c28f5
......@@ -8,27 +8,13 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_concat_15.tmp_0
node:
- concat_15.tmp_0
- teacher_concat_14.tmp_0
- concat_14.tmp_0
merge_feed: true
teacher_model_dir: ./ppyoloe_crn_l_300e_coco/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
use_pact: true
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -37,7 +23,8 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.00001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
......@@ -7,33 +7,25 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_441.tmp_0
alpha: 1.0
loss: l2
node:
- conv2d_441.tmp_0
merge_feed: true
teacher_model_dir: ./tinypose_128x96/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
is_full_quantize: false
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'abs_max'
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
weight_bits: 8
TrainConfig:
epochs: 1
eval_iter: 1000
learning_rate: 0.0001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
#origin_metric: 0.291
......@@ -7,28 +7,15 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_84.tmp_0
alpha: 1.0
loss: l2
node:
- conv2d_84.tmp_0
- teacher_conv2d_85.tmp_0
- conv2d_85.tmp_0
- teacher_conv2d_86.tmp_0
- conv2d_86.tmp_0
merge_feed: true
teacher_model_dir: ./yolov3_mobilenet_v1_270e_coco/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -37,8 +24,9 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.0001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
#origin_metric: 0.289
......@@ -9,29 +9,16 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_106.tmp_1
alpha: 1.0
loss: l2
node:
- conv2d_106.tmp_1
- teacher_conv2d_113.tmp_1
- conv2d_113.tmp_1
- teacher_conv2d_119.tmp_1
- conv2d_119.tmp_1
merge_feed: true
teacher_model_dir: ./yolov5s_infer/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
use_pact: true
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -40,7 +27,8 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.00001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
target_metric: 0.365
......@@ -13,28 +13,30 @@ Quantization:
weight_bits: 8 # number of bits for weight quantization
activation_quantize_type: 'range_abs_max' # activation quantization method
weight_quantize_type: 'channel_wise_abs_max' # weight quantization method
is_full_quantize: false # whether to quantize all supported ops
not_quant_pattern: [skip_quant] # name_scope patterns of layers to skip during quantization (keep the default)
quantize_op_types: [conv2d, depthwise_conv2d] # list of ops to quantize
dtype: 'int8' # parameter dtype after quantization; default int8, currently only int8 is supported
window_size: 10000 # window size of the 'range_abs_max' quantization method, default 10000
moving_rate: 0.9 # decay coefficient of the 'moving_average_abs_max' quantization method, default 0.9
for_tensorrt: false # whether the quantized model will be deployed with TensorRT; if true, the quantized op types are TENSORRT_OP_TYPES. Default: false
is_full_quantize: false # whether to quantize all supported ops
```
#### Customizing the distillation strategy
The distillation parameters mainly specify the distillation nodes (`distill_node_pair`) and the path of the teacher inference model, as shown below:
The distillation parameters mainly specify the distillation nodes (`node`) and the path of the teacher inference model, as shown below:
```yaml
Distillation:
# distill_lambda: weight of the distillation loss; multiple values can be given so that different nodes use different lambda values
distill_lambda: 1.0
# distill_loss: distillation loss algorithm; multiple losses can be given so that different nodes use different loss algorithms
distill_loss: l2_loss
# distill_node_pair: distillation nodes, i.e. the variable names of layer outputs; must contain the teacher nodes and the corresponding student nodes,
# where the teacher node names are automatically prefixed with "teacher_" by the program;
# multiple node pairs can be given to distill multiple nodes
distill_node_pair:
- teacher_relu_30.tmp_0
# alpha: weight of the distillation loss; multiple values can be given so that different nodes use different alpha values
alpha: 1.0
# loss: distillation loss algorithm; multiple losses can be given so that different nodes use different loss algorithms
loss: l2
# node: distillation nodes, i.e. the variable names of layer outputs. Two cases are supported:
# 1. for self-distillation, the nodes only need to contain student nodes; multiple nodes are supported;
# 2. for other kinds of distillation, the nodes must contain the teacher nodes and the corresponding student nodes,
# grouped in pairs, one from the teacher model and one from the student model; multiple node pairs are supported.
node:
- relu_30.tmp_0
# merge_feed: true if the teacher and the student share the same inputs, false otherwise
merge_feed: true
# teacher_model_dir: directory that holds the inference model file and the parameter file
teacher_model_dir: ./inference_model
# teacher_model_filename: inference model file, in *.pdmodel or __model__ format
......@@ -43,16 +45,14 @@ Distillation:
teacher_params_filename: model.pdiparams
```
- Supported distillation losses are currently fsp_loss, l2_loss and soft_label_loss; custom losses are also possible. See the [knowledge distillation API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html) for their definitions and usage.
- Supported distillation losses are currently fsp, l2 and soft_label; custom losses are also possible. See the [knowledge distillation API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html) for their definitions and usage.
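For reference, a short sketch of a teacher-student configuration that distills two output nodes at once; the node names and teacher paths below are taken from the YOLOv3 example config in this repository and must be replaced with the variable names of your own model:
```yaml
Distillation:
  alpha: 1.0
  loss: l2
  node:
  - teacher_conv2d_84.tmp_0
  - conv2d_84.tmp_0
  - teacher_conv2d_85.tmp_0
  - conv2d_85.tmp_0
  teacher_model_dir: ./yolov3_mobilenet_v1_270e_coco/
  teacher_model_filename: model.pdmodel
  teacher_params_filename: model.pdiparams
```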
#### Customizing the structured (channel) pruning strategy
The structured pruning parameters are set as follows:
```yaml
Prune:
# prune_algo: pruning algorithm
prune_algo: prune
ChannelPrune:
# pruned_ratio: pruning ratio
pruned_ratio: 0.25
# prune_params_name: names of the parameters to be pruned
......@@ -61,9 +61,27 @@ Prune:
# criterion: the metric used to evaluate the importance of channels within a convolution layer
criterion: l1_norm
```
- prune_algo currently supports: prune, asp and transformer_pruner.
- criterion currently supports: l1_norm, bn_scale and geometry_median. See the [structured pruning API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/prune/prune_api.html) for their definitions and usage.
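Putting the keys together, a minimal `ChannelPrune` sketch; the parameter name `conv1_weights` is the same illustrative placeholder used in the ASP example below and has to be replaced with real parameter names from your model:
```yaml
ChannelPrune:
  # pruned_ratio: pruning ratio
  pruned_ratio: 0.25
  # prune_params_name: names of the parameters to be pruned
  prune_params_name:
  - conv1_weights
  # criterion: the metric used to evaluate channel importance
  criterion: l1_norm
```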
#### Customizing the ASP semi-structured sparsity strategy
The semi-structured sparsity parameters are set as follows:
```yaml
ASPPrune:
# prune_params_name: names of the parameters to be pruned
prune_params_name:
- conv1_weights
```
#### Customizing the structured pruning strategy for Transformer models
The structured pruning parameters for Transformer models are set as follows:
```yaml
TransformerPrune:
# pruned_ratio: pruning ratio of each fully-connected layer
pruned_ratio: 0.25
```
#### Customizing the unstructured sparsity strategy
The unstructured sparsity parameters are set as follows:
......@@ -73,8 +91,8 @@ UnstructurePrune:
prune_strategy: gmp
# prune_mode: sparsification mode, either 'ratio' or 'threshold'
prune_mode: ratio
# pruned_ratio: sparsity ratio, only effective when prune_mode=='ratio'
pruned_ratio: 0.75
# ratio: sparsity ratio, only effective when prune_mode=='ratio'
ratio: 0.75
# threshold: sparsity threshold, only effective when prune_mode=='threshold'
threshold: 0.001
# gmp_config: extra training hyperparameters that guide the GMP training process
......@@ -112,9 +130,11 @@ TrainConfig:
epochs: 14
eval_iter: 400
learning_rate: 5.0e-03
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
```
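If gradient clipping or an explicit regularizer is needed, `optimizer_builder` accepts them as nested blocks. A minimal sketch, assuming `Momentum`, `L2Decay` and `ClipGradByGlobalNorm` from `paddle.optimizer`, `paddle.regularizer` and `paddle.nn` are the classes you want; all values are illustrative:
```yaml
TrainConfig:
  epochs: 14
  eval_iter: 400
  learning_rate: 5.0e-03
  optimizer_builder:
    optimizer:
      type: Momentum
      momentum: 0.9
    regularizer:
      type: L2Decay
      coeff: 0.0005
    grad_clip:
      type: ClipGradByGlobalNorm
      clip_norm: 1.0
```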
- Learning-rate decay strategy: mainly set the class name of the schedule and its parameters, as shown below. Paddle already implements a variety of decay schedules, see the [lr documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.2/api/paddle/optimizer/lr/LRScheduler_cn.html); the schedule parameters are the init arguments of the corresponding class.
```yaml
......
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_softmax_0.tmp_0
alpha: 1.0
loss: l2
node:
- softmax_0.tmp_0
merge_feed: true
teacher_model_dir: MobileNetV1_infer
teacher_model_filename: inference.pdmodel
teacher_params_filename: inference.pdiparams
......@@ -23,7 +21,8 @@ TrainConfig:
epochs: 1
eval_iter: 500
learning_rate: 0.004
optimizer: Momentum
optim_args:
optimizer_builder:
optimizer:
type: Momentum
weight_decay: 0.00003
origin_metric: 0.70898
\ No newline at end of file
origin_metric: 0.70898
......@@ -115,9 +115,10 @@ TrainConfig:
epochs: 6
eval_iter: 1070
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7403
```
......
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 1070
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7403
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 100
eval_iter: 70
learning_rate: 1.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.8421
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 2000
learning_rate: 3.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.8098
\ No newline at end of file
origin_metric: 0.8098
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 16
eval_iter: 1000
learning_rate: 1.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7736
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 12
eval_iter: 750
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.6021
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 20
eval_iter: 1050
learning_rate: 3.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7620
\ No newline at end of file
origin_metric: 0.7620
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 1110
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.5666
\ No newline at end of file
origin_metric: 0.5666
......@@ -243,8 +243,8 @@ if __name__ == '__main__':
paddle.enable_static()
compress_config, train_config, _ = load_config(args.config_path)
if train_config is not None and 'optim_args' in train_config:
train_config['optim_args'][
if train_config is not None:
train_config.optimizer_builder[
'apply_decay_param_fun'] = apply_decay_param_fun
train_dataloader, eval_dataloader = reader()
......
......@@ -5,7 +5,8 @@ TrainConfig:
epochs: 14
eval_iter: 400
learning_rate: 5.0e-03
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
optimizer: SGD
......@@ -2,28 +2,21 @@ Global:
reader_config: configs/pp_humanseg_lite.yaml
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_batch_norm_47.tmp_2
alpha: 1.0
loss: l2
node:
- batch_norm_47.tmp_2
merge_feed: true
teacher_model_dir: ./ppseg_lite_portrait_398x224_with_softmax
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
weight_bits: 8
TrainConfig:
epochs: 1
eval_iter: 400
learning_rate: 0.0005
optimizer: SGD
optim_args:
weight_decay: 4.0e-05
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
......@@ -2,19 +2,15 @@ Global:
reader_config: configs/pp_humanseg_lite.yaml
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_batch_norm_47.tmp_2
alpha: 1.0
loss: l2
node:
- batch_norm_47.tmp_2
merge_feed: true
teacher_model_dir: ./ppseg_lite_portrait_398x224_with_softmax
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
UnstructurePrune:
prune_strategy: gmp
prune_mode: ratio
pruned_ratio: 0.75
ratio: 0.75
gmp_config:
stable_iterations: 0
pruning_iterations: 4500
......@@ -24,6 +20,7 @@ UnstructurePrune:
initial_ratio: 0.15
prune_params_type: conv1x1_only
local_sparsity: True
TrainConfig:
epochs: 14
eval_iter: 400
......@@ -31,7 +28,7 @@ TrainConfig:
type: PiecewiseDecay
boundaries: [4500]
values: [0.005, 0.0005]
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
optimizer: SGD
......@@ -13,7 +13,7 @@ import numpy as np
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
import models
from utility import add_arguments, print_arguments, _download, _decompress
from paddleslim.dist import merge, l2_loss, soft_label_loss, fsp_loss
from paddleslim.dist import merge, l2, soft_label, fsp
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
......@@ -173,8 +173,8 @@ def compress(args):
merge(teacher_program, student_program, data_name_map, place)
with paddle.static.program_guard(student_program, s_startup):
distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
student_program)
distill_loss = soft_label("teacher_fc_0.tmp_0", "fc_0.tmp_0",
student_program)
loss = avg_cost + distill_loss
lr, opt = create_optimizer(args)
opt.minimize(loss)
......
......@@ -89,7 +89,7 @@ In order to ensure that the data of the teacher network and the student network
data_name_map = {'image': 'image'}
main = slim.dist.merge(teacher_program, student_program, data_name_map, fluid.CPUPlace())
with fluid.program_guard(student_program, student_startup):
l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
l2_loss = slim.dist.l2('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
loss = l2_loss + avg_cost
opt = fluid.optimizer.Momentum(0.01, 0.9)
opt.minimize(loss)
......
......@@ -27,10 +27,12 @@ AutoCompression
Currently only the following combined strategies or single-strategy configurations are supported:
1) ``Quantization`` & ``HyperParameterOptimization``: post-training quantization with hyperparameter search;
2) ``Quantization`` & ``Distillation``: quantization-aware training with distillation;
3) ``Prune`` & ``Distillation``: structured pruning with distillation;
4) ``UnstructurePrune`` & ``Distillation``: unstructured sparsity with distillation;
5) ``Distillation``: single-teacher distillation only;
6) ``MultiTeacherDistillation``: multi-teacher distillation.
3) ``ChannelPrune`` & ``Distillation``: channel pruning with distillation;
4) ``ASPPrune`` & ``Distillation``: ASP semi-structured pruning with distillation;
5) ``TransformerPrune`` & ``Distillation``: Transformer structured pruning with distillation;
6) ``UnstructurePrune`` & ``Distillation``: unstructured sparsity with distillation;
7) ``Distillation``: single-teacher distillation only;
8) ``MultiTeacherDistillation``: multi-teacher distillation.
If set to None, a strategy is selected automatically for compression. Default: None.
- **eval_callback(function, optional)** - Evaluation callback used to track how well the model is training; see `<https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/static/auto-compression/custom_function.rst>`_ for how to write it. ``eval_callback`` and ``eval_dataloader`` must not both be None. Default: None.
- **eval_dataloader(paddle.io.Dataloader, optional)** - If a test data loader is passed in, the ``EMD`` distance is used to measure the difference between the model before and after compression; currently only post-training quantization with hyperparameter search supports judging the compression result this way.
......@@ -62,11 +64,11 @@ AutoCompression
default_distill_config = {
"distill_loss": args.distill_loss,
"loss": args.loss,
"distill_node_pair": args.distill_node_pair,
"node": args.node,
"distill_lambda": args.distill_lambda,
"alpha": args.alpha,
"teacher_model_dir": args.teacher_model_dir,
......@@ -84,7 +86,7 @@ AutoCompression
strategy_config="Quantization": Quantization(**default_ptq_config),
"HyperParameterOptimization": HyperParameterOptimization(**default_hpo_config)}, \
"Distillation": HyperParameterOptimization(**default_distill_config)}, \
train_config=None, train_dataloader=train_dataloader, eval_callback=eval_dataloader,devices='gpu')
......@@ -104,12 +106,14 @@ TrainConfig
**Parameters:**
- **epochs(int)** - Number of training epochs, i.e. how many times the current dataset is traversed.
- **learning_rate(float|LRScheduler)** - Learning rate used during model optimization.
- **optimizer(str)** - Optimizer to use; must be the name of an optimizer in ``paddle.optimizer``, e.g. ``SGD``.
- **optim_args(dict)** - Optimizer arguments. The following keys can be specified:
``grad_clip``, the gradient clipping method to use; must be the name of a gradient clipping class in ``paddle.nn``, e.g. ``ClipGradByValue``.
``grad_clip_args``, the arguments of the chosen gradient clipping method; for example, when ``ClipGradByValue`` is selected, ``grad_clip_args`` can set ``max`` and ``min``, see `ClipGradByValue <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/ClipGradByValue_cn.html#clipgradbyvalue>`_.
Other arguments an optimizer may need, e.g. ``beta1``, ``beta2``, ``apply_decay_param_fun``, see `AdamW <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/optimizer/AdamW_cn.html#adamw>`_.
- **train_iter(int, optional)** - Total number of training iterations, i.e. how many batches to iterate over; only one of ``epochs`` and ``train_iter`` needs to be set.
- **learning_rate(float|dict)** - Learning rate used during model optimization. If a dict is given, its keys are: ``type``, the class name of the learning-rate schedule, chosen from the classes in ``paddle.optimizer.lr``;
the remaining keys follow the parameters of the schedule class actually used.
- **optimizer_builder(dict)** - The optimizer to use and its related configuration (see the YAML sketch below). The keys of the dict are:
``optimizer(dict)``: the key ``type`` must be the class name of an optimizer in ``paddle.optimizer``, e.g. ``SGD``; the remaining keys follow the parameters of the optimizer actually used.
``weight_decay(float, optional)``: weight decay applied during compression training.
``regularizer(dict)``: the key ``type`` must be the class name of a weight-decay regularizer in ``paddle.regularizer``; the remaining keys follow the parameters of the class actually used.
``grad_clip(dict)``: the key ``type`` must be the class name of a gradient clipping class in ``paddle.nn``, e.g. ``ClipGradByValue``; the remaining keys follow the parameters of the class actually used.
- **eval_iter(int)** - Run an evaluation every this many batches of training.
- **logging_iter(int)** - Print logs every this many batches of training.
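As a reference, a minimal sketch of the corresponding YAML ``TrainConfig`` block, assuming a ``PiecewiseDecay`` schedule and ``ClipGradByValue`` clipping; all values are illustrative:

.. code-block:: yaml

    TrainConfig:
      epochs: 1
      eval_iter: 1000
      learning_rate:
        type: PiecewiseDecay
        boundaries: [4500]
        values: [0.005, 0.0005]
      optimizer_builder:
        optimizer:
          type: SGD
        weight_decay: 0.0005
        grad_clip:
          type: ClipGradByValue
          max: 1.0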
......@@ -124,7 +128,7 @@ TrainConfig
Refer to `amp_config <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp_configs>`_ for the corresponding parameter settings.
- **recompute_config(dict, optional)** - The recompute memory-optimization logic can be used when the fleet API is used. Configure the parameters as described in the fleet interface: `recompute_configs <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute_configs>`_.
- **sharding_config(dict, optional)** - The sharding strategy can be used when the fleet API is used. Configure the parameters as described in the fleet interface: `sharding_configs <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding_configs>`_.
- **sparse_model(bool, optional)** - Set ``sparse_model`` to True to remove the redundant mask tensors from models produced by unstructured sparsity. Default: False.
Quantization
----------
......@@ -147,14 +151,13 @@ Distillation
**Parameters:**
- **distill_loss(str|list[str])** - Name(s) of the distillation loss; the available losses are those supported by PaddleSlim: ``fsp_loss``, ``l2_loss``, ``soft_label_loss``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **loss(str|list[str])** - Name(s) of the distillation loss; the available losses are those supported by PaddleSlim: ``fsp``, ``l2``, ``soft_label``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **distill_node_pair(list[str])** - List of distillation node names; every two nodes form a pair, one from the teacher model and one from the student model.
- **distill_lambda(float|list[float])** - Weight of each distillation loss; its length must match the length of ``distill_loss``.
- **node(list[str])** - List of distillation node names. Two cases are supported: 1. for self-distillation, the nodes only need to contain student nodes, and multiple nodes are supported; 2. for other kinds of distillation, the nodes must contain the teacher nodes and the corresponding student nodes, grouped in pairs, one from the teacher model and one from the student model.
- **alpha(float|list[float])** - Weight of each distillation loss; its length must match the length of ``loss``.
- **teacher_model_dir(str)** - Directory of the teacher model.
- **teacher_model_filename(str)** - File name of the teacher model.
- **teacher_params_filename(str)** - File name of the teacher parameters.
- **merge_feed(bool)** - Whether the distillation shares the same input data. Default: ``True``.
MultiTeacherDistillation
......@@ -164,14 +167,13 @@ MultiTeacherDistillation
**Parameters:**
- **distill_loss(list[str])** - Names of the distillation losses; the available losses are those supported by PaddleSlim: ``fsp_loss``, ``l2_loss``, ``soft_label_loss``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **loss(list[str])** - Names of the distillation losses; the available losses are those supported by PaddleSlim: ``fsp``, ``l2``, ``soft_label``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **distill_node_pair(list[list[str]])** - Nested list of distillation node names; the number of teacher models must match the length of the outer list. Each inner list describes the distillation nodes between one teacher model and the student model, where every two nodes form a pair, one from the teacher model and one from the student model.
- **distill_lambda(list[float])** - Weight of each distillation loss; its length must match the length of ``distill_loss``.
- **node(list[list[str]])** - Nested list of distillation node names; the number of teacher models must match the length of the outer list. Each inner list describes the distillation nodes between one teacher model and the student model, where every two nodes form a pair, one from the teacher model and one from the student model.
- **alpha(list[float])** - Weight of each distillation loss; its length must match the length of ``loss``.
- **teacher_model_dir(list[str])** - List of teacher model directories.
- **teacher_model_filename(list[str])** - List of teacher model file names.
- **teacher_params_filename(list[str])** - List of teacher parameter file names.
- **merge_feed(bool)** - Whether the distillation shares the same input data. Default: ``True``.
HyperParameterOptimization
......
......@@ -49,16 +49,16 @@ merge
data_name_map, place)
fsp_loss
fsp
---------
.. py:function:: paddleslim.dist.fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=None)
.. py:function:: paddleslim.dist.fsp(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=None)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L90>`_
Add fsp_loss between the teacher vars and student vars inside the program.
Add fsp loss between the teacher vars and student vars inside the program.
The fsp_loss comes from the paper `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning <http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf>`_
The fsp loss comes from the paper `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning <http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf>`_
**Parameters:**
......@@ -70,7 +70,7 @@ fsp_loss出自论文 `A Gift from Knowledge Distillation: Fast Optimization, Net
**Returns:**
- (Variable): The fsp_loss built from teacher_var1, teacher_var2, student_var1 and student_var2
- (Variable): The fsp loss built from teacher_var1, teacher_var2, student_var1 and student_var2
**Example:**
......@@ -96,15 +96,15 @@ fsp_loss出自论文 `A Gift from Knowledge Distillation: Fast Optimization, Net
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
dist.merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program):
distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
distillation_loss = dist.fsp('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
's1.tmp_1', 's2.tmp_1', student_program)
l2_loss
l2
------------
.. py:function:: paddleslim.dist.l2_loss(teacher_var_name, student_var_name, program=None)
.. py:function:: paddleslim.dist.l2(teacher_var_name, student_var_name, program=None)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L118>`_
......@@ -144,15 +144,15 @@ l2_loss
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
dist.merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program):
distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
distillation_loss = dist.l2('teacher_t2.tmp_1', 's2.tmp_1',
student_program)
soft_label_loss
soft_label
-------------------
.. py:function:: paddleslim.dist.soft_label_loss(teacher_var_name, student_var_name, program=None, teacher_temperature=1., student_temperature=1.)
.. py:function:: paddleslim.dist.soft_label(teacher_var_name, student_var_name, program=None, teacher_temperature=1., student_temperature=1.)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L136>`_
......@@ -170,7 +170,7 @@ soft_label_loss出自论文 `Distilling the Knowledge in a Neural Network <https
**Returns:**
- (Variable): The soft_label_loss built from teacher_var and student_var
- (Variable): The soft label loss built from teacher_var and student_var
**Example:**
......
......@@ -92,7 +92,7 @@ merge操作将student_program和teacher_program中的所有Tensor和Op都将被
data_name_map = {'image': 'image'}
main = slim.dist.merge(teacher_program, student_program, data_name_map, paddle.CPUPlace())
with paddle.static.program_guard(student_program, student_startup):
l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
l2_loss = slim.dist.l2('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
loss = l2_loss + avg_cost
opt = paddle.optimizer.Momentum(0.01, 0.9)
opt.minimize(loss)
......
......@@ -92,7 +92,6 @@ def create_strategy_config(strategy_str, model_type):
### default prune config
default_prune_config = {
'pruned_ratio': float(tmp_s[1]),
'prune_algo': 'prune',
'criterion': 'l1_norm'
}
else:
......@@ -105,10 +104,12 @@ def create_strategy_config(strategy_str, model_type):
'local_sparsity': True,
'prune_params_type': 'conv1x1_only'
}
tmp_s[0] = tmp_s[0].replace('prune', 'Prune')
if model_type == 'transformer':
tmp_s[0] = tmp_s[0].replace('prune', 'TransformerPrune')
default_prune_config = {'pruned_ratio': float(tmp_s[1])}
else:
tmp_s[0] = tmp_s[0].replace('prune', 'Prune')
tmp_s[0] = tmp_s[0].replace('sparse', 'UnstructurePrune')
if model_type == 'transformer' and tmp_s[0] == 'Prune':
default_prune_config['prune_algo'] = 'transformer_pruner'
prune_config = eval(tmp_s[0])(**default_prune_config)
configs.append({tmp_s[0]: prune_config, 'Distillation': dis_config})
......
......@@ -82,15 +82,21 @@ class AutoCompression:
2. set ``Quantization`` and ``HyperParameterOptimization`` to get quant_post and hyperparameter optimization compress config.
The Quantization config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L24`_ .
The HyperParameterOptimization config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L73`_ .
3. set ``Prune`` and ``Distillation`` to get prune and distillation compress config.
The Prune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
3. set ``ChannelPrune`` and ``Distillation`` to get channel prune and distillation compress config.
The ChannelPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
4. set ``UnstructurePrune`` and ``Distillation`` to get unstructureprune and distillation compress config.
4. set ``ASPPrune`` and ``Distillation`` to get asp prune and distillation compress config.
The ASPPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
5. set ``TransformerPrune`` and ``Distillation`` to get transformer prune and distillation compress config.
The TransformerPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
6. set ``UnstructurePrune`` and ``Distillation`` to get unstructureprune and distillation compress config.
The UnstructurePrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L91`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
5. set ``Distillation`` to use one teacher model to distill the student model.
7. set ``Distillation`` to use one teacher model to distill the student model.
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
6. set ``MultiTeacherDistillation`` to use multiple teachers to distill the student model.
8. set ``MultiTeacherDistillation`` to use multiple teachers to distill the student model.
The MultiTeacherDistillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L56`_ .
If set to None, will choose a strategy automatically. Default: None.
......@@ -149,6 +155,8 @@ class AutoCompression:
self._strategy, self._config = self._prepare_strategy(
self.strategy_config)
#print(self._strategy, self._config[0].__dict__)
#sys.exit()
# If train_config is None, set default train_config
if self.train_config is None:
......@@ -189,14 +197,15 @@ class AutoCompression:
for strategy_c in strategy_config:
quant_config = strategy_c.get("Quantization", None)
hpo_config = strategy_c.get("HyperParameterOptimization", None)
prune_config = strategy_c.get("Prune", None)
prune_config = strategy_c.get("ChannelPrune", None)
asp_config = strategy_c.get("ASPPrune", None)
transformer_prune_config = strategy_c.get("TransformerPrune", None)
unstructure_prune_config = strategy_c.get("UnstructurePrune", None)
single_teacher_distill_config = strategy_c.get("Distillation", None)
if single_teacher_distill_config is not None and single_teacher_distill_config.teacher_model_dir is None:
single_teacher_distill_config = single_teacher_distill_config._replace(
teacher_model_dir=self.model_dir,
teacher_model_filename=self.model_filename,
teacher_params_filename=self.params_filename)
single_teacher_distill_config.teacher_model_dir = self.model_dir
single_teacher_distill_config.teacher_model_filename = self.model_filename
single_teacher_distill_config.teacher_params_filename = self.params_filename
multi_teacher_distill_config = strategy_c.get(
"MultiTeacherDistillation", None)
......@@ -219,17 +228,29 @@ class AutoCompression:
### case3: prune_config & distill config
elif prune_config is not None and self._distill_config is not None:
strategy.append('prune_dis')
strategy.append('channel_prune_dis')
config.append(merge_config(prune_config, self._distill_config))
### case4: unstructure_config & distill config
### case4: asp_config & distill config
elif asp_config is not None and self._distill_config is not None:
strategy.append('asp_prune_dis')
config.append(merge_config(asp_config, self._distill_config))
### case5: transformer_prune_config & distill config
elif transformer_prune_config is not None and self._distill_config is not None:
strategy.append('transformer_prune_dis')
config.append(
merge_config(transformer_prune_config,
self._distill_config))
### case6: unstructure_config & distill config
elif unstructure_prune_config is not None and self._distill_config is not None:
strategy.append('unstructure_prune_dis')
config.append(
merge_config(unstructure_prune_config,
self._distill_config))
### case4: distill_config
### case7: distill_config
elif self._distill_config is not None:
if single_teacher_distill_config is not None:
strategy.append('single_teacher_dis')
......@@ -272,7 +293,7 @@ class AutoCompression:
train_program_info = ProgramInfo(startup_program, train_program,
feed_target_names, fetch_targets)
config_dict = dict(config._asdict())
config_dict = config.__dict__
if "prune_strategy" in config_dict and config_dict[
"prune_strategy"] == "gmp" and config_dict[
'gmp_config'] is None:
......@@ -313,7 +334,7 @@ class AutoCompression:
self._exe,
self._places,
config_dict,
self.train_config._asdict(),
self.train_config.__dict__,
train_program_info,
pruner=self._pruner,
dist_strategy=dist_strategy,
......@@ -345,7 +366,7 @@ class AutoCompression:
train_program_info.optimizer.amp_init(
self._places, scope=paddle.static.global_scope())
if 'prune_algo' in config_dict and config_dict['prune_algo'] == 'asp':
if 'asp' in strategy:
### prune weight in scope
self._pruner.prune_model(train_program_info.program)
......
......@@ -33,7 +33,7 @@ def load_config(config_path):
compress_config = {}
for key, value in cfg.items():
default_key = eval(key)(**value)
default_key = eval(key)(**value) if value is not None else eval(key)()
compress_config[key] = default_key
if compress_config.get('TrainConfig') != None:
......
......@@ -17,6 +17,7 @@ import numpy as np
import paddle
import paddle.distributed.fleet as fleet
import paddle.optimizer as optimizer
import paddle.regularizer as regularizer
from ..quant.quanter import quant_aware, _quant_config_default, _parse_configs, pact, get_pact_optimizer
from ..dist import *
from ..common.recover_program import recover_inference_program, _remove_fetch_node
......@@ -44,37 +45,73 @@ def _create_lr_scheduler(train_config):
def _create_optimizer(train_config):
"""create optimizer"""
opt = getattr(optimizer, train_config.get('optimizer') or
'SGD') ### default optimizer is SGD
if 'optim_args' in train_config:
if train_config[
'optim_args'] is not None and 'grad_clip' in train_config[
'optim_args'] and train_config['optim_args'][
'grad_clip'] is not None:
grad_clip = getattr(
paddle.nn, train_config['optim_args']['grad_clip'])(
**train_config['optim_args']['grad_clip_args'])
train_config['optim_args'].pop('grad_clip')
train_config['optim_args'].pop('grad_clip_args')
else:
grad_clip = None
if 'grad_clip' in train_config['optim_args'] and train_config[
'optim_args']['grad_clip'] is None:
train_config['optim_args'].pop('grad_clip')
train_config['optim_args'].pop('grad_clip_args')
if 'optimizer_builder' not in train_config:
train_config['optimizer_builder'] = {'optimizer': {'type': 'SGD'}}
optimizer_builder = train_config['optimizer_builder']
if 'grad_clip' in optimizer_builder:
g_clip_params = optimizer_builder['grad_clip']
g_clip_type = g_clip_params.pop('type')
grad_clip = getattr(paddle.nn, g_clip_type)(**g_clip_params)
else:
train_config['optim_args'] = {}
grad_clip = None
### build regularization
if 'regularizer' in optimizer_builder:
reg_params = optimizer_builder['regularizer']
reg_type = reg_params.pop('type')
reg = getattr(regularizer, reg_type)(**reg_params)
elif 'weight_decay' in optimizer_builder:
reg = optimizer_builder.pop('weight_decay')
else:
reg = None
### build learning rate
lr = _create_lr_scheduler(train_config)
op = opt(learning_rate=lr,
grad_clip=grad_clip,
**train_config['optim_args'])
return op, lr
### build optimizer
optim_params = optimizer_builder['optimizer']
optim_type = optim_params.pop('type')
opt = getattr(optimizer, optim_type)(learning_rate=lr,
grad_clip=grad_clip,
weight_decay=reg,
**optim_params)
return opt, lr
def _get_distill_node(student_program, config):
node = config.get('node')
if len(node) == 0:
return None
### the type of node is list or list(list)
if isinstance(node[0], list):
test_node = node[0][0]
else:
test_node = node[0]
try:
test_var = student_program.global_block().var(test_node)
distill_node_pair = []
if isinstance(node[0], list):
for n_list in node:
tmp_node_pair = []
for n in n_list:
tmp_node_pair.append('teacher_' + n)
tmp_node_pair.append(n)
distill_node_pair.append(tmp_node_pair)
else:
for n in node:
distill_node_pair.append('teacher_' + n)
distill_node_pair.append(n)
return distill_node_pair
except:
return node
def _parse_distill_loss(distill_node_pair,
distill_loss='l2_loss',
distill_loss='l2',
distill_lambda=1.0):
"""parse distill loss config"""
loss_dist = 0.0
......@@ -135,9 +172,9 @@ def _load_program_and_merge(executor,
data_name_map = {}
if 'merge_feed' not in config or config['merge_feed'] == True:
assert len(feed_target_names) == len(teacher_feed_target_names), \
"the number of feed nodes in the teacher model is not equal to the student model"
merge_feed = (
sorted(feed_target_names) == sorted(teacher_feed_target_names))
if merge_feed == True:
for i, name in enumerate(feed_target_names):
data_name_map[teacher_feed_target_names[i]] = name
......@@ -153,7 +190,7 @@ def _load_program_and_merge(executor,
place,
teacher_scope=new_scope,
name_prefix=teacher_name_prefix,
merge_feed=config.get('merge_feed') or True)
merge_feed=merge_feed)
if teacher_idx == None or teacher_idx == 1:
return train_program, test_program, data_name_map
else:
......@@ -180,6 +217,9 @@ def build_distill_program(executor,
feed_target_names = train_program_info.feed_target_names
fetch_targets = train_program_info.fetch_targets
distill_node_pair = _get_distill_node(train_program,
config) or default_distill_node_pair
teacher_model_dir = config[
"teacher_model_dir"] if "teacher_model_dir" in config else config[
"teacher_model_path_prefix"]
......@@ -270,16 +310,15 @@ def build_distill_program(executor,
**train_config['amp_config'])
distill_loss, losses = _parse_distill_loss(
config.get('distill_node_pair') or default_distill_node_pair,
config.get('distill_loss') or
'l2_loss', ### default loss is l2_loss
config.get('distill_lambda') or 1.0) ### default lambda is 1.0
distill_node_pair,
config.get('loss') or 'l2', ### default loss is l2
config.get('alpha') or 1.0) ### default alpha is 1.0
loss = paddle.mean(distill_loss)
loss.stop_gradient = False
if 'prune_algo' in config: ### prune & asp
if config['prune_algo'] == 'asp' and not train_config.get(
'use_fleet'):
if 'prune_params_name' in config: ### prune
if 'pruned_ratio' not in config and not train_config.get(
'use_fleet'): ### asp
optimizer = pruner.decorate(optimizer)
optimizer.minimize(loss)
elif 'prune_strategy' in config: ###unstructure prune
......@@ -302,11 +341,8 @@ def build_quant_program(executor, place, config, train_program_info,
scope = paddle.static.global_scope()
assert isinstance(config, dict), "quant config must be dict"
default_config = _quant_config_default
default_config.update(config)
config = _parse_configs(default_config)
use_pact = config["use_pact"]
use_pact = config.pop("use_pact")
if use_pact:
act_preprocess_func = pact
optimizer_func = get_pact_optimizer
......@@ -364,13 +400,13 @@ def build_prune_program(executor,
strategy,
patterns,
eval_dataloader=None):
if 'unstructure' in strategy:
if strategy.startswith('unstructure'):
from ..prune.unstructured_pruner import UnstructuredPruner, GMPUnstructuredPruner
if config["prune_strategy"] is None:
pruner = UnstructuredPruner(
train_program_info.program,
mode=config['prune_mode'],
ratio=config['pruned_ratio'],
ratio=config['ratio'],
threshold=config['threshold'],
prune_params_type=config['prune_params_type'],
place=place,
......@@ -378,69 +414,65 @@ def build_prune_program(executor,
elif config["prune_strategy"] == "gmp":
pruner = GMPUnstructuredPruner(
train_program_info.program,
ratio=config['pruned_ratio'],
ratio=config['ratio'],
prune_params_type=config['prune_params_type'],
place=place,
local_sparsity=config['local_sparsity'],
configs=config['gmp_config'])
elif strategy.startswith('channel_prune'):
from ..prune import Pruner
pruner = Pruner(config["criterion"])
params = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block().all_parameters():
if config['prune_params_name'] is not None and param.name in config[
'prune_params_name']:
params.append(param.name)
pruned_program, _, _ = pruner.prune(
train_program_info.program,
paddle.static.global_scope(),
params=params,
ratios=[config['pruned_ratio']] * len(params),
place=place)
train_program_info.program = pruned_program
elif strategy.startswith('asp'):
from paddle.static import sparsity
pruner = sparsity
excluded_params_name = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block().all_parameters():
if config[
'prune_params_name'] is not None and param.name not in config[
'prune_params_name']:
excluded_params_name.append(param.name)
if "teacher_" in param.name:
excluded_params_name.append(param.name)
pruner.set_excluded_layers(train_program_info.program,
excluded_params_name)
elif strategy.startswith('transformer_prune'):
from .transformer_pruner import TransformerPruner
assert eval_dataloader is not None, "transformer_pruner must set eval_dataloader"
label_info = _get_label_info(eval_dataloader,
train_program_info.feed_target_names)
assert len(label_info) != 0, \
"maybe something wrong in get label name from eval_dataloader, please check your eval_dataloader"
pruner = TransformerPruner(
executor,
place,
train_program_info.program,
patterns,
label_info,
width_mult=(1.0 - config['pruned_ratio']),
dataloader=eval_dataloader,
fetch_targets=train_program_info.fetch_targets)
pruned_program = pruner.prune()
train_program_info.program = pruned_program
else:
if config['prune_algo'] == 'prune':
from ..prune import Pruner
pruner = Pruner(config["criterion"])
params = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block(
).all_parameters():
if config[
'prune_params_name'] is not None and param.name in config[
'prune_params_name']:
params.append(param.name)
pruned_program, _, _ = pruner.prune(
train_program_info.program,
paddle.static.global_scope(),
params=params,
ratios=[config['pruned_ratio']] * len(params),
place=place)
train_program_info.program = pruned_program
elif config['prune_algo'] == 'asp':
from paddle.static import sparsity
pruner = sparsity
excluded_params_name = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block(
).all_parameters():
if config[
'prune_params_name'] is not None and param.name not in config[
'prune_params_name']:
excluded_params_name.append(param.name)
if "teacher_" in param.name:
excluded_params_name.append(param.name)
pruner.set_excluded_layers(train_program_info.program,
excluded_params_name)
elif config['prune_algo'] == 'transformer_pruner':
from .transformer_pruner import TransformerPruner
assert eval_dataloader is not None, "transformer_pruner must set eval_dataloader"
label_info = _get_label_info(eval_dataloader,
train_program_info.feed_target_names)
assert len(label_info) != 0, \
"maybe something wrong in get label name from eval_dataloader, please check your eval_dataloader"
pruner = TransformerPruner(
executor,
place,
train_program_info.program,
patterns,
label_info,
width_mult=(1.0 - config['pruned_ratio']),
dataloader=eval_dataloader,
fetch_targets=train_program_info.fetch_targets)
pruned_program = pruner.prune()
train_program_info.program = pruned_program
else:
raise NotImplementedError(
"prune_algo must be choice in [\"prune\", \"asp\"], {} is not support".
format(config['prune_algo']))
raise NotImplementedError(
"prune_algo must be choice in [\"prune\", \"asp\"], {} is not support".
format(config['prune_algo']))
return pruner, train_program_info
......
......@@ -16,122 +16,301 @@ from collections import namedtuple
__all__ = [
"Quantization", "Distillation", "MultiTeacherDistillation", \
"HyperParameterOptimization", "Prune", "UnstructurePrune", \
"merge_config", "ProgramInfo", "TrainConfig",
"HyperParameterOptimization", "ChannelPrune", "UnstructurePrune", \
"TransformerPrune", "ASPPrune", "merge_config", "ProgramInfo", "TrainConfig",
]
### Quantization:
Quantization = namedtuple(
"Quantization",
[
"quantize_op_types",
"weight_bits",
"activation_bits",
"not_quant_pattern", # Only support in QAT
"use_pact", # Only support in QAT
"is_full_quantize",
"activation_quantize_type",
"weight_quantize_type"
])
Quantization.__new__.__defaults__ = (None, ) * (
len(Quantization._fields) - 3) + (False, 'moving_average_abs_max',
'channel_wise_abs_max')
### Distillation:
Distillation = namedtuple(
"Distillation",
[
"distill_loss", ### list[list],支持不同节点之间使用不同的loss。
"distill_node_pair", ### list[list],支持不同节点之间使用不同的loss。
"distill_lambda", ### list[list],支持不同节点之间使用不同的loss。
"teacher_model_dir",
"teacher_model_filename",
"teacher_params_filename",
"merge_feed",
])
Distillation.__new__.__defaults__ = (None, ) * (len(Distillation._fields) - 1
) + (True, )
### Multi-teacher distillation config
### Multi-Teacher Distillation:
MultiTeacherDistillation = namedtuple(
"MultiTeacherDistillation",
[
"distill_loss", ### list[str],每个teacher对应一个loss
"distill_node_pair", ### list[list],每个teacher对应一个蒸馏。仅支持logits蒸馏,不支持中间层蒸馏
"distill_lambda", ### list[float],每个teacher对应一个lambda。
"teacher_model_dir",
"teacher_model_filename", ### list[str], 每个teacher对应一个模型文件
"teacher_params_filename", ### list[str], 每个teacher对应一个参数文件
"merge_feed",
])
MultiTeacherDistillation.__new__.__defaults__ = (None, ) * (
len(MultiTeacherDistillation._fields) - 1) + (True, )
### If not set, the hyperparameter search uses the default search space; if set, the given search space is used. This also supports a single PTQ strategy
###HyperParameterOptimization
HyperParameterOptimization = namedtuple("HyperParameterOptimization", [
"ptq_algo", "bias_correct", "weight_quantize_type", "hist_percent",
"batch_num", "max_quant_count"
])
HyperParameterOptimization.__new__.__defaults__ = (None, ) * (
len(HyperParameterOptimization._fields) - 1) + (20, )
### Prune
Prune = namedtuple("Prune", [
"prune_algo",
"pruned_ratio",
"prune_params_name",
"criterion",
])
Prune.__new__.__defaults__ = (None, ) * len(Prune._fields)
### UnstructurePrune
UnstructurePrune = namedtuple("UnstructurePrune", [
"prune_strategy",
"prune_mode",
"threshold",
"pruned_ratio",
"gmp_config",
"prune_params_type",
"local_sparsity",
])
UnstructurePrune.__new__.__defaults__ = (None, ) * len(UnstructurePrune._fields)
### Train
TrainConfig = namedtuple(
"Train",
[
"epochs", # Training total epoch
"train_iter", # Training total iteration, `epochs` or `train_iter` only need to set one.
"learning_rate",
"optimizer",
"optim_args",
"eval_iter",
"logging_iter",
"origin_metric",
"target_metric",
"use_fleet",
"amp_config",
"recompute_config",
"sharding_config",
"sparse_model",
])
TrainConfig.__new__.__defaults__ = (None, ) * len(TrainConfig._fields)
class Quantization:
def __init__(self,
quantize_op_types=[
'conv2d', 'depthwise_conv2d', 'mul', 'matmul', 'matmul_v2'
],
weight_bits=8,
activation_bits=8,
not_quant_pattern=['skip_quant'],
use_pact=False,
activation_quantize_type='moving_average_abs_max',
weight_quantize_type='channel_wise_abs_max',
dtype='int8',
window_size=10000,
moving_rate=0.9,
for_tensorrt=False,
is_full_quantize=False):
"""
Quantization Config.
Args:
quantize_op_types(list(str)): Ops of type in quantize_op_types, will be quantized. Default: ['conv2d', 'depthwise_conv2d', 'mul', 'matmul', 'matmul_v2'].
weight_bits(int): Weight quantize bit num. Default: 8.
activation_bits(int): Activation quantize bit num. Default 8.
not_quant_pattern(list(str)): Ops of name_scope in not_quant_pattern list, will not be quantized. Default: 'skip_quant'.
use_pact(bool): Whether to use pact in quantization training. Default: False.
activation_quantize_type(str): Activation quantize type. Default is 'moving_average_abs_max'.
weight_quantize_type(str): Weight quantize type. Default 'channel_wise_abs_max'.
dtype(str): Data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'.
window_size(int): Window size for 'range_abs_max' quantization. Default: 10000.
moving_rate(float): The decay coefficient of moving average. Default: 0.9.
for_tensorrt(bool): If True, 'quantize_op_types' will be TENSORRT_OP_TYPES. Default: False.
is_full_quantize(bool): If True, 'quantize_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES. Default: False.
"""
self.quantize_op_types = quantize_op_types
self.weight_bits = weight_bits
self.activation_bits = activation_bits
self.not_quant_pattern = not_quant_pattern
self.use_pact = use_pact
self.is_full_quantize = is_full_quantize
self.activation_quantize_type = activation_quantize_type
self.weight_quantize_type = weight_quantize_type
self.dtype = dtype
self.window_size = window_size
self.moving_rate = moving_rate
self.for_tensorrt = for_tensorrt
self.is_full_quantize = is_full_quantize
class Distillation:
def __init__(self,
loss='l2',
node=[],
alpha=1.0,
teacher_model_dir=None,
teacher_model_filename=None,
teacher_params_filename=None):
"""
Distillation Config.
Args:
loss(str|list(str)): Distillation loss; the available loss types are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html>`_. If a list of losses is given, different nodes can use different distillation losses, and the length of loss must equal the length of node. Default: 'l2'.
node(list(str)|list(list(str))): Distillation nodes, chosen by the user from the model before compression. If a list of lists is given, every inner list uses the same distillation loss, and the length of the outer list must equal the length of loss. Default: [].
alpha(float|list(float)): The weight of the distillation loss. If a list of alphas is given, its length must equal the length of loss. Default: 1.0.
teacher_model_dir(str, optional): The path of the teacher inference model; the model and params saved by ``paddle.static.io.save_inference_model`` are under this path. If set to None, the teacher model will be the model before compression. Default: None.
teacher_model_filename(str, optional): The name of teacher model file. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
teacher_params_filename(str, optional): The name of teacher params file. When all parameters are saved in a single file, set it as filename. If parameters are saved in separate files, set it as 'None'. Default : 'None'.
"""
self.loss = loss
self.node = node
self.alpha = alpha
self.teacher_model_dir = teacher_model_dir
self.teacher_model_filename = teacher_model_filename
self.teacher_params_filename = teacher_params_filename
class MultiTeacherDistillation:
def __init__(self,
loss=[],
node=[],
alpha=[],
teacher_model_dir=[],
teacher_model_filename=[],
teacher_params_filename=[]):
"""
Multi-Teacher Distillation Config.
Args:
loss(list(str)): The list of distillation losses; the available loss types are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html>`_. Losses correspond one-to-one to teacher models. Default: [].
node(list(list(str))): Nested list of distillation nodes, chosen by the user from the model before compression. Every inner list describes the nodes distilled with one teacher model, and the length of the outer list must equal the length of loss. Default: [].
alpha(list(float)): The list of weights of the distillation losses. Alphas correspond one-to-one to losses. Default: [].
teacher_model_dir(list): The list of paths of the teacher inference models; the model and params saved by ``paddle.static.io.save_inference_model`` are under each path. If set to None, the teacher model will be the model before compression. Default: None.
teacher_model_filename(list): The list of teacher model file names. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
teacher_params_filename(list): The list of teacher params file names. When all parameters are saved in a single file, set it as the filename. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
"""
self.loss = loss
self.node = node
self.alpha = alpha
self.teacher_model_dir = teacher_model_dir
self.teacher_model_filename = teacher_model_filename
self.teacher_params_filename = teacher_params_filename
class HyperParameterOptimization:
def __init__(self,
ptq_algo=["KL", "hist", "avg", "mse"],
bias_correct=[True, False],
weight_quantize_type=['channel_wise_abs_max'],
hist_percent=[0.98, 0.999],
batch_num=[10, 30],
max_quant_count=20):
"""
HyperParameterOptimization Config.
Args:
ptq_algo(list(str)): Post-training quantization algorithms; the available algorithms are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/quant/quantization_api.html#quant-post-static>`_.
bias_correct(list(bool)): Whether to use bias correction.
weight_quantize_type(list(str)): Quantization type for weights, chosen from 'channel_wise_abs_max' or 'abs_max'.
hist_percent(list(float)): The upper and lower bounds of threshold of algo 'hist' for activations, the real percent is uniform sampling in this bounds.
batch_num(list(int)): The upper and lower bounds of batch number, the real batch number is uniform sampling in this bounds.
max_quant_count(int): Max number of model quantization. Default: 20.
"""
self.ptq_algo = ptq_algo
self.bias_correct = bias_correct
self.weight_quantize_type = weight_quantize_type
self.hist_percent = hist_percent
self.batch_num = batch_num
self.max_quant_count = max_quant_count
class ChannelPrune:
def __init__(self, pruned_ratio, prune_params_name, criterion='l1_norm'):
"""
ChannelPrune Config.
Args:
pruned_ratio(float): The ratio of channels to be pruned.
prune_params_name(list(str)): A list of parameter names to be pruned.
criterion(str|function): The criterion used to sort channels for pruning; can be chosen from ['l1_norm', 'bn_scale', 'geometry_median']. Default: 'l1_norm'.
"""
self.pruned_ratio = pruned_ratio
self.prune_params_name = prune_params_name
self.criterion = criterion
class ASPPrune:
def __init__(self, prune_params_name):
"""
ASPPrune Config.
Args:
prune_params_name(list(str)): A list of parameter names to be pruned.
"""
self.prune_params_name = prune_params_name
class TransformerPrune:
def __init__(self, pruned_ratio):
"""
TransformerPrune Config.
Args:
pruned_ratio(float): The ratio to be pruned in each fully-connected layer.
"""
self.pruned_ratio = pruned_ratio
class UnstructurePrune:
def __init__(self,
prune_strategy=None,
prune_mode='ratio',
threshold=0.01,
ratio=0.55,
gmp_config=None,
prune_params_type=None,
local_sparsity=False):
"""
UnstructurePrune Config.
Args:
prune_strategy(str, optional): The pruning strategy, currently we support base and gmp, ``None`` means use base pruning strategy. Default: ``None``.
prune_mode(str): The pruning mode: whether by ratio or by threshold. Default: 'ratio'.
threshold(float): The threshold to set zeros, the abs(weights) lower than which will be zeros. Default: 0.01.
ratio(float): The ratio to set zeros, the smaller portion will be zeros. Default: 0.55.
gmp_config(dict): The dictionary contains all the configs for GMP pruner. Default: None. The detailed description is as below:
.. code-block:: python
{'stable_iterations': int} # the duration of stable phase in terms of global iterations
{'pruning_iterations': int} # the duration of pruning phase in terms of global iterations
{'tunning_iterations': int} # the duration of tunning phase in terms of global iterations
{'resume_iteration': int} # the start timestamp you want to train from, in terms of global iterations
{'pruning_steps': int} # the total times you want to increase the ratio
{'initial_ratio': float} # the initial ratio value
..
prune_params_type(str): Which kind of params should be pruned, we only support None (all but norms) and conv1x1_only for now. Default: None.
local_sparsity(bool): Whether to prune all the parameter matrix at the same ratio or not. Default: False.
"""
self.prune_strategy = prune_strategy
self.prune_mode = prune_mode
self.threshold = threshold
self.ratio = ratio
self.gmp_config = gmp_config
self.prune_params_type = prune_params_type
self.local_sparsity = local_sparsity
class TrainConfig:
def __init__(self,
epochs=None,
train_iter=None,
learning_rate=0.02,
optimizer_builder={'optimizer': {'type': 'SGD'}},
eval_iter=1000,
logging_iter=10,
origin_metric=None,
target_metric=None,
use_fleet=False,
amp_config=None,
recompute_config=None,
sharding_config=None,
sparse_model=False):
"""
Train Config.
Args:
epochs(int): The number of total epochs. Default: None.
train_iter(int): Training total iteration, `epochs` or `train_iter` only need to set one. Default: None.
learning_rate(float|dict): learning rate in the training. If a dict is given, the detailed description of learning_rate is as below:
.. code-block:: python
'type'(str) # the class name of learning rate decay, can reference in paddle.optimizer.lr.
..
the other keys of learning_rate depend on the parameters of the chosen learning-rate decay class.
For example, to use ``PiecewiseDecay``, set learning_rate like:
{'type': PiecewiseDecay, 'boundaries': [4500], 'values': [0.005, 0.0005]}.
optimizer_builder(str|dict): the optimizer used in the training. If a dict is given, the detailed description of optimizer_builder is as below:
.. code-block:: python
'optimizer'(dict) # the 'type' in the optimizer needs to be a class name in paddle.optimizer;
the other keys of the optimizer depend on the parameters of that class.
'weight_decay(float, optional)' # weight decay in the training.
'regularizer(dict)': # the 'type' in the regularizer needs to be a class name in paddle.regularizer;
the other keys of the regularizer depend on the parameters of that class.
'grad_clip(dict)': # the 'type' in the grad_clip needs to be a class name in paddle.nn, such as 'ClipGradByGlobalNorm';
the other keys of grad_clip depend on the parameters of that class.
..
eval_iter(int): Test period in batches. Default: 1000.
logging_iter(int): Log period in batches. Default: 10.
origin_metric(float, optional): The metric of the model before compression; if it is not None, it is used to check whether the dataloader is correct. Default: None.
target_metric(float, optional): The target metric of the model after compression; if set, training stops once the compressed model satisfies it. If not set, training runs for the number of epochs the user configured. Default: None.
use_fleet(bool): Whether to use fleet. Default: False.
amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description is as below if use_fleet=False:
.. code-block:: python
AMP-O1 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id2>`_ :
{'custom_white_list', set} # The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
{'custom_black_list': set} # The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
{'custom_black_varnames': set} # Users' custom black variables' names.
AMP-O2 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id3>`_ :
{'use_pure_fp16': bool} # Whether to use the pure fp16 training.
{'use_fp16_guard': bool} # Whether to use `fp16_guard` when constructing the program.
..
If you want to use AMP-O2, you need to set use_pure_fp16 is True and use_fp16_guard is False.
If use_fleet=True, the key of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_.
recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config only can be set when use_fleet=True, the key of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_.
sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config only can be set when use_fleet=True, the key of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_.
sparse_model(bool, optional): Set sparse_model to ``True`` to remove mask tensor when the compress strategy is unstructure prune. Default: False.
"""
self.epochs = epochs
self.train_iter = train_iter
self.learning_rate = learning_rate
self.optimizer_builder = optimizer_builder
self.eval_iter = eval_iter
self.logging_iter = logging_iter
self.origin_metric = origin_metric
self.target_metric = target_metric
self.use_fleet = use_fleet
self.amp_config = amp_config
self.recompute_config = recompute_config
self.sharding_config = sharding_config
self.sparse_model = sparse_model
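# Editor's sketch (not part of the diff): one way to build a TrainConfig with dict-style
# learning_rate and optimizer_builder, mirroring the docstring above. The concrete values
# are illustrative assumptions only.
#
# train_config = TrainConfig(
#     train_iter=3000,
#     learning_rate={
#         'type': 'PiecewiseDecay',
#         'boundaries': [4500],
#         'values': [0.005, 0.0005]
#     },
#     optimizer_builder={
#         'optimizer': {'type': 'SGD'},
#         'weight_decay': 4.0e-05
#     },
#     eval_iter=1000,
#     logging_iter=10)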
class MergeConfig:
def __init__(self, **kwargs):
for name, value in kwargs.items():
setattr(self, name, value)
def merge_config(*args):
fields = set()
cfg = dict()
for arg in args:
fields = fields.union(arg._fields)
cfg.update(dict(arg._asdict()))
MergeConfig = namedtuple("MergeConfig", fields)
cfg.update(arg.__dict__)
return MergeConfig(**cfg)
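# Editor's sketch (hypothetical, not in the diff): with the MergeConfig class above,
# merge_config appears to flatten the __dict__ of every config object it receives into
# a single MergeConfig. The DummyConfig class below is invented purely for illustration.
#
# class DummyConfig:
#     def __init__(self, **kwargs):
#         self.__dict__.update(kwargs)
#
# merged = merge_config(DummyConfig(epochs=1), DummyConfig(eval_iter=1000))
# assert merged.epochs == 1 and merged.eval_iter == 1000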
......@@ -143,6 +322,16 @@ class ProgramInfo:
fetch_targets,
optimizer=None,
learning_rate=None):
"""
ProgramInfo Config.
Args:
startup_program(paddle.static.Program): The startup program; its meaning is described in `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#cn-api-fluid-default-startup-program>`_.
program(paddle.static.Program): The main program; its meaning is described in `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#cn-api-fluid-default-main-program>`_.
feed_target_names(list(str)): The names of the feed tensors in the program.
fetch_targets(list(Variable)): The fetch variables in the program.
optimizer(Optimizer, optional): The optimizer used in training. Default: None.
learning_rate(float|paddle.optimizer.lr, optional): The learning rate used in training. Default: None.
"""
self.startup_program = startup_program
self.program = program
self.feed_target_names = feed_target_names
......
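# Editor's sketch (illustrative, not part of the diff): ProgramInfo appears to be a plain
# container for the pieces returned by paddle.static.load_inference_model; "./model_dir"
# is a placeholder path, not a file from this PR.
#
# import paddle
# paddle.enable_static()
# exe = paddle.static.Executor(paddle.CPUPlace())
# program, feed_target_names, fetch_targets = paddle.static.load_inference_model(
#     "./model_dir", exe)
# info = ProgramInfo(
#     startup_program=paddle.static.default_startup_program(),
#     program=program,
#     feed_target_names=feed_target_names,
#     fetch_targets=fetch_targets)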
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .single_distiller import merge, fsp_loss, l2_loss, soft_label_loss, loss
from .single_distiller import merge, fsp, l2, soft_label, loss
from .dml import DML
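# Editor's sketch (not part of the diff): the renamed helpers keep their previous call
# signatures; only the `_loss` suffix is dropped. The variable names below are the ones
# used by the unit tests later in this diff and are assumed to exist after merge().
#
# from paddleslim.dist import merge, fsp, l2, soft_label
# l2_cost = l2('teacher_conv6_bn_output.tmp_2', 'conv2_bn_output.tmp_2')
# soft_cost = soft_label('teacher_conv6_bn_output.tmp_2', 'conv2_bn_output.tmp_2')
# fsp_cost = fsp('teacher_conv1_out.tmp_1', 'teacher_conv6_out.tmp_0',
#                'conv1_out.tmp_0', 'conv2_out.tmp_0')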
......@@ -54,7 +54,8 @@ def merge(teacher_program,
teacher_program = teacher_program.clone(for_test=True)
for teacher_var in teacher_program.list_vars():
skip_rename = False
if teacher_var.name != 'fetch' and (not merge_feed or teacher_var.name != 'feed'):
if teacher_var.name != 'fetch' and (not merge_feed or
teacher_var.name != 'feed'):
if teacher_var.name in data_name_map.keys():
new_name = data_name_map[teacher_var.name]
if new_name == teacher_var.name:
......@@ -72,7 +73,8 @@ def merge(teacher_program,
teacher_var.name, new_name)
for teacher_var in teacher_program.list_vars():
if teacher_var.name != 'fetch' and (not merge_feed or teacher_var.name != 'feed'):
if teacher_var.name != 'fetch' and (not merge_feed or
teacher_var.name != 'feed'):
# student program add var
new_var = student_program.global_block()._clone_variable(
teacher_var, force_persistable=False)
......@@ -111,11 +113,11 @@ def merge(teacher_program,
op._op._set_attr("skip_quant", True)
def fsp_loss(teacher_var1_name,
teacher_var2_name,
student_var1_name,
student_var2_name,
program=None):
def fsp(teacher_var1_name,
teacher_var2_name,
student_var1_name,
student_var2_name,
program=None):
"""Combine variables from student model and teacher model by fsp-loss.
Args:
......@@ -149,7 +151,7 @@ def fsp_loss(teacher_var1_name,
return fsp_loss
def l2_loss(teacher_var_name, student_var_name, program=None):
def l2(teacher_var_name, student_var_name, program=None):
"""Combine variables from student model and teacher model by l2-loss.
Args:
......@@ -170,11 +172,11 @@ def l2_loss(teacher_var_name, student_var_name, program=None):
return l2_loss
def soft_label_loss(teacher_var_name,
student_var_name,
program=None,
teacher_temperature=1.,
student_temperature=1.):
def soft_label(teacher_var_name,
student_var_name,
program=None,
teacher_temperature=1.,
student_temperature=1.):
"""Combine variables from student model and teacher model by soft-label-loss.
Args:
......
......@@ -27,7 +27,7 @@ import paddle.fluid as fluid
from ..common.recover_program import recover_inference_program
from .quanter import _quant_config_default, _parse_configs, pact, get_pact_optimizer
from .quanter import quant_aware, convert
from ..dist import merge, l2_loss, soft_label_loss, fsp_loss
from ..dist import merge, l2, soft_label, fsp
from ..auto_compression.create_compressed_program import build_distill_program
import logging
logging.getLogger().setLevel(logging.INFO)
......@@ -57,7 +57,7 @@ _train_config_default = {
and the teacher node and student node are arranged in pairs.
for example, ["teacher_fc_0.tmp_0", "fc_0.tmp_0", "teacher_batch_norm_24.tmp_4", "batch_norm_24.tmp_4"]
"""
"distill_node_pair": None
"node": None
}
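# Editor's sketch (illustrative): after this rename, the train_config passed to
# quant_aware_with_infermodel lists only student output nodes under "node"; the names
# below are copied from the unit test further down in this diff, and the remaining keys
# are only indicated, not spelled out.
#
# train_config = {
#     "node": [
#         "fc_0.tmp_0", "batch_norm_24.tmp_4", "batch_norm_22.tmp_4",
#         "batch_norm_18.tmp_4", "batch_norm_13.tmp_4", "batch_norm_5.tmp_4"
#     ],
#     # plus num_epoch, save_iter_step, learning_rate, weight_decay, use_pact,
#     # quant_model_ckpt_path, model_path_prefix, teacher_model_path_prefix, ...
# }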
......@@ -91,12 +91,10 @@ def _parse_train_configs(train_config):
"'teacher_model_path_prefix' must both be string"
assert isinstance(configs['model_path_prefix'], str), \
"'model_path_prefix' must both be str"
assert isinstance(configs['distill_node_pair'], list), \
"'distill_node_pair' must both be list"
assert len(configs['distill_node_pair']) > 0, \
"'distill_node_pair' not configured with distillation nodes"
assert len(configs['distill_node_pair']) % 2 == 0, \
"'distill_node_pair' distillation nodes need to be configured in pairs"
assert isinstance(configs['node'], list), \
"'node' must both be list"
assert len(configs['node']) > 0, \
"'node' not configured with distillation nodes"
return train_config
......@@ -143,7 +141,7 @@ def quant_aware_with_infermodel(executor,
train_config(dict): quantization-aware training configs, including num_epoch, save_iter_step, learning_rate,
weight_decay, use_pact, quant_model_ckpt_path,
model_path_prefix, teacher_model_path_prefix,
distill_node_pair(teacher_node_name1, node_name1, teacher_node_name2, teacher_node_name2, ...)
node(node_name1, node_name2, ...)
test_callback(callback function): a callback that takes two params: the compiled test quant program and the checkpoint save filename.
Users can implement their test logic in it.
Returns:
......@@ -261,7 +259,7 @@ def export_quant_infermodel(
train_config(dict): quantization-aware training configs, including num_epoch, save_iter_step, learning_rate,
weight_decay, use_pact, quant_model_ckpt_path,
model_path_prefix, teacher_model_path_prefix,
distill_node_pair(teacher_node_name1, node_name1, teacher_node_name2, teacher_node_name2, ...)
node(node_name1, node_name2, ...)
checkpoint_path(str): the checkpoint path from which to export the quantized inference model.
export_inference_model_path_prefix(str): the path prefix of the exported inference model, i.e. the model's storage directory plus the model name (excluding suffix).
Returns:
......
......@@ -15,7 +15,7 @@ import sys
sys.path.append("../")
import unittest
import paddle
from paddleslim.dist import merge, fsp_loss
from paddleslim.dist import merge, fsp
from layers import conv_bn_layer
from static_case import StaticCase
......@@ -49,9 +49,8 @@ class TestFSPLoss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = fsp_loss('teacher_conv1_out.tmp_1',
'teacher_conv6_out.tmp_0', 'conv1_out.tmp_0',
'conv2_out.tmp_0')
distill_loss = fsp('teacher_conv1_out.tmp_1', 'teacher_conv6_out.tmp_0',
'conv1_out.tmp_0', 'conv2_out.tmp_0')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......
......@@ -16,7 +16,7 @@ sys.path.append("../")
import unittest
import paddle
from static_case import StaticCase
from paddleslim.dist import merge, l2_loss
from paddleslim.dist import merge, l2
from layers import conv_bn_layer
......@@ -48,8 +48,8 @@ class TestL2Loss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = l2_loss('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
distill_loss = l2('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......
......@@ -17,6 +17,7 @@ sys.path.append("../")
sys.path.append(".")
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
import unittest
import copy
import paddle
from paddleslim.quant import quant_aware, convert
from paddleslim.quant import quant_aware_with_infermodel, export_quant_infermodel
......@@ -145,13 +146,10 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
"./quantaware_with_infermodel_checkpoints/",
"teacher_model_path_prefix": float_infer_model_path_prefix,
"model_path_prefix": float_infer_model_path_prefix,
"distill_node_pair": [
"teacher_fc_0.tmp_0", "fc_0.tmp_0",
"teacher_batch_norm_24.tmp_4", "batch_norm_24.tmp_4",
"teacher_batch_norm_22.tmp_4", "batch_norm_22.tmp_4",
"teacher_batch_norm_18.tmp_4", "batch_norm_18.tmp_4",
"teacher_batch_norm_13.tmp_4", "batch_norm_13.tmp_4",
"teacher_batch_norm_5.tmp_4", "batch_norm_5.tmp_4"
"node": [
"fc_0.tmp_0", "batch_norm_24.tmp_4", "batch_norm_22.tmp_4",
"batch_norm_18.tmp_4", "batch_norm_13.tmp_4",
"batch_norm_5.tmp_4"
]
}
......@@ -184,7 +182,7 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
scope=None,
train_reader=train_loader,
quant_config=quant_config,
train_config=train_config,
train_config=copy.deepcopy(train_config),
test_callback=test_callback)
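# Editor's note (assumption): train_config is deep-copied here and in the export call
# below because quant_aware_with_infermodel / export_quant_infermodel appear to modify
# the dict they receive in place; reusing one dict across both calls without a copy
# could carry state from the first call into the second.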
def test_export_quant_infermodel(exe, place, checkpoint_path,
......@@ -194,7 +192,7 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
place,
scope=None,
quant_config=quant_config,
train_config=train_config,
train_config=copy.deepcopy(train_config),
checkpoint_path=checkpoint_path,
export_inference_model_path_prefix=quant_infermodel_save_path)
......
......@@ -15,7 +15,7 @@ import sys
sys.path.append("../")
import unittest
import paddle
from paddleslim.dist import merge, soft_label_loss
from paddleslim.dist import merge, soft_label
from layers import conv_bn_layer
from static_case import StaticCase
......@@ -48,8 +48,8 @@ class TestSoftLabelLoss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = soft_label_loss('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
distill_loss = soft_label('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......