Unverified commit 604c0bbe, authored by ceci3, committed by GitHub

optimization yaml (#1157)

* optimization yaml

* update yaml

* update yaml

* fix unittest

* fix unittest
Parent 769c28f5
......@@ -8,27 +8,13 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_concat_15.tmp_0
node:
- concat_15.tmp_0
- teacher_concat_14.tmp_0
- concat_14.tmp_0
merge_feed: true
teacher_model_dir: ./ppyoloe_crn_l_300e_coco/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
use_pact: true
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -37,7 +23,8 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.00001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
......@@ -7,33 +7,25 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_441.tmp_0
alpha: 1.0
loss: l2
node:
- conv2d_441.tmp_0
merge_feed: true
teacher_model_dir: ./tinypose_128x96/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
is_full_quantize: false
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'abs_max'
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
weight_bits: 8
TrainConfig:
epochs: 1
eval_iter: 1000
learning_rate: 0.0001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
#origin_metric: 0.291
......@@ -7,28 +7,15 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_84.tmp_0
alpha: 1.0
loss: l2
node:
- conv2d_84.tmp_0
- teacher_conv2d_85.tmp_0
- conv2d_85.tmp_0
- teacher_conv2d_86.tmp_0
- conv2d_86.tmp_0
merge_feed: true
teacher_model_dir: ./yolov3_mobilenet_v1_270e_coco/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -37,8 +24,9 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.0001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
#origin_metric: 0.289
......@@ -9,29 +9,16 @@ Global:
params_filename: model.pdiparams
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_conv2d_106.tmp_1
alpha: 1.0
loss: l2
node:
- conv2d_106.tmp_1
- teacher_conv2d_113.tmp_1
- conv2d_113.tmp_1
- teacher_conv2d_119.tmp_1
- conv2d_119.tmp_1
merge_feed: true
teacher_model_dir: ./yolov5s_infer/
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
use_pact: true
activation_bits: 8
weight_bits: 8
activation_quantize_type: 'range_abs_max'
weight_quantize_type: 'channel_wise_abs_max'
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
......@@ -40,7 +27,8 @@ TrainConfig:
train_iter: 3000
eval_iter: 1000
learning_rate: 0.00001
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 4.0e-05
target_metric: 0.365
......@@ -13,28 +13,30 @@ Quantization:
weight_bits: 8 # number of bits for weight quantization
activation_quantize_type: 'range_abs_max' # activation quantization method
weight_quantize_type: 'channel_wise_abs_max' # weight quantization method
is_full_quantize: false # whether to quantize all supported ops
not_quant_pattern: [skip_quant] # name_scope patterns of layers to skip during quantization (keep the default)
quantize_op_types: [conv2d, depthwise_conv2d] # list of ops to quantize
dtype: 'int8' # parameter dtype after quantization; default int8, currently only int8 is supported
window_size: 10000 # window size of the 'range_abs_max' quantization method, default 10000
moving_rate: 0.9 # decay coefficient of the 'moving_average_abs_max' quantization method, default 0.9
for_tensorrt: false # whether the quantized model will be deployed with TensorRT; if true, the quantized op types are TENSORRT_OP_TYPES. Default: false
is_full_quantize: false # whether to quantize all supported ops
```
#### Customizing the distillation strategy
The distillation parameters mainly specify the distillation nodes (`distill_node_pair`) and the path of the teacher inference model, as shown below:
The distillation parameters mainly specify the distillation nodes (`node`) and the path of the teacher inference model, as shown below:
```yaml
Distillation:
# distill_lambda: weight of the distillation loss; multiple values can be given so that different nodes use different lambda values
distill_lambda: 1.0
# distill_loss: distillation loss algorithm; multiple losses can be given so that different nodes use different loss algorithms
distill_loss: l2_loss
# distill_node_pair: distillation nodes, i.e. the variable names of layer outputs; must contain the teacher nodes and the corresponding student nodes,
# where the teacher node names are automatically prefixed with "teacher_" by the program;
# multiple node pairs can be given to distill multiple nodes
distill_node_pair:
- teacher_relu_30.tmp_0
# alpha: weight of the distillation loss; multiple values can be given so that different nodes use different alpha values
alpha: 1.0
# loss: distillation loss algorithm; multiple losses can be given so that different nodes use different loss algorithms
loss: l2
# node: distillation nodes, i.e. the variable names of layer outputs. Two cases are supported:
# 1. for self-distillation, the nodes only need to contain student nodes; multiple nodes are supported;
# 2. for other kinds of distillation, the nodes must contain the teacher nodes and the corresponding student nodes,
# grouped in pairs, one from the teacher model and one from the student model; multiple node pairs are supported.
node:
- relu_30.tmp_0
# merge_feed: true if the teacher and the student share the same inputs, false otherwise
merge_feed: true
# teacher_model_dir: directory that holds the inference model file and the parameter file
teacher_model_dir: ./inference_model
# teacher_model_filename: inference model file, in *.pdmodel or __model__ format
......@@ -43,16 +45,14 @@ Distillation:
teacher_params_filename: model.pdiparams
```
- Supported distillation losses are currently fsp_loss, l2_loss and soft_label_loss; custom losses are also possible. See the [knowledge distillation API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html) for their definitions and usage.
- Supported distillation losses are currently fsp, l2 and soft_label; custom losses are also possible. See the [knowledge distillation API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html) for their definitions and usage.
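For reference, a short sketch of a teacher-student configuration that distills two output nodes at once; the node names and teacher paths below are taken from the YOLOv3 example config in this repository and must be replaced with the variable names of your own model:
```yaml
Distillation:
  alpha: 1.0
  loss: l2
  node:
  - teacher_conv2d_84.tmp_0
  - conv2d_84.tmp_0
  - teacher_conv2d_85.tmp_0
  - conv2d_85.tmp_0
  teacher_model_dir: ./yolov3_mobilenet_v1_270e_coco/
  teacher_model_filename: model.pdmodel
  teacher_params_filename: model.pdiparams
```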
#### Customizing the structured (channel) pruning strategy
The structured pruning parameters are set as follows:
```yaml
Prune:
# prune_algo: pruning algorithm
prune_algo: prune
ChannelPrune:
# pruned_ratio: pruning ratio
pruned_ratio: 0.25
# prune_params_name: names of the parameters to be pruned
......@@ -61,9 +61,27 @@ Prune:
# criterion: the metric used to evaluate the importance of channels within a convolution layer
criterion: l1_norm
```
- prune_algo currently supports: prune, asp and transformer_pruner.
- criterion currently supports: l1_norm, bn_scale and geometry_median. See the [structured pruning API docs](https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/prune/prune_api.html) for their definitions and usage.
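Putting the keys together, a minimal `ChannelPrune` sketch; the parameter name `conv1_weights` is the same illustrative placeholder used in the ASP example below and has to be replaced with real parameter names from your model:
```yaml
ChannelPrune:
  # pruned_ratio: pruning ratio
  pruned_ratio: 0.25
  # prune_params_name: names of the parameters to be pruned
  prune_params_name:
  - conv1_weights
  # criterion: the metric used to evaluate channel importance
  criterion: l1_norm
```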
#### Customizing the ASP semi-structured sparsity strategy
The semi-structured sparsity parameters are set as follows:
```yaml
ASPPrune:
# prune_params_name: names of the parameters to be pruned
prune_params_name:
- conv1_weights
```
#### Customizing the structured pruning strategy for Transformer models
The structured pruning parameters for Transformer models are set as follows:
```yaml
TransformerPrune:
# pruned_ratio: pruning ratio of each fully-connected layer
pruned_ratio: 0.25
```
#### Customizing the unstructured sparsity strategy
The unstructured sparsity parameters are set as follows:
......@@ -73,8 +91,8 @@ UnstructurePrune:
prune_strategy: gmp
# prune_mode: sparsification mode, either 'ratio' or 'threshold'
prune_mode: ratio
# pruned_ratio: sparsity ratio, only effective when prune_mode=='ratio'
pruned_ratio: 0.75
# ratio: sparsity ratio, only effective when prune_mode=='ratio'
ratio: 0.75
# threshold: sparsity threshold, only effective when prune_mode=='threshold'
threshold: 0.001
# gmp_config: extra training hyperparameters that guide the GMP training process
......@@ -112,9 +130,11 @@ TrainConfig:
epochs: 14
eval_iter: 400
learning_rate: 5.0e-03
optimizer: SGD
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
```
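If gradient clipping or an explicit regularizer is needed, `optimizer_builder` accepts them as nested blocks. A minimal sketch, assuming `Momentum`, `L2Decay` and `ClipGradByGlobalNorm` from `paddle.optimizer`, `paddle.regularizer` and `paddle.nn` are the classes you want; all values are illustrative:
```yaml
TrainConfig:
  epochs: 14
  eval_iter: 400
  learning_rate: 5.0e-03
  optimizer_builder:
    optimizer:
      type: Momentum
      momentum: 0.9
    regularizer:
      type: L2Decay
      coeff: 0.0005
    grad_clip:
      type: ClipGradByGlobalNorm
      clip_norm: 1.0
```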
- Learning-rate decay strategy: mainly set the class name of the schedule and its parameters, as shown below. Paddle already implements a variety of decay schedules, see the [lr documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.2/api/paddle/optimizer/lr/LRScheduler_cn.html); the schedule parameters are the init arguments of the corresponding class.
```yaml
......
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_softmax_0.tmp_0
alpha: 1.0
loss: l2
node:
- softmax_0.tmp_0
merge_feed: true
teacher_model_dir: MobileNetV1_infer
teacher_model_filename: inference.pdmodel
teacher_params_filename: inference.pdiparams
......@@ -23,7 +21,8 @@ TrainConfig:
epochs: 1
eval_iter: 500
learning_rate: 0.004
optimizer: Momentum
optim_args:
optimizer_builder:
optimizer:
type: Momentum
weight_decay: 0.00003
origin_metric: 0.70898
\ No newline at end of file
origin_metric: 0.70898
......@@ -115,9 +115,10 @@ TrainConfig:
epochs: 6
eval_iter: 1070
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7403
```
......
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 1070
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7403
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 100
eval_iter: 70
learning_rate: 1.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.8421
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 2000
learning_rate: 3.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.8098
\ No newline at end of file
origin_metric: 0.8098
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 16
eval_iter: 1000
learning_rate: 1.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7736
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 12
eval_iter: 750
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.6021
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 20
eval_iter: 1050
learning_rate: 3.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.7620
\ No newline at end of file
origin_metric: 0.7620
......@@ -2,7 +2,8 @@ TrainConfig:
epochs: 6
eval_iter: 1110
learning_rate: 2.0e-5
optim_args:
optimizer_builder:
optimizer:
type: AdamW
weight_decay: 0.01
optimizer: AdamW
origin_metric: 0.5666
\ No newline at end of file
origin_metric: 0.5666
......@@ -243,8 +243,8 @@ if __name__ == '__main__':
paddle.enable_static()
compress_config, train_config, _ = load_config(args.config_path)
if train_config is not None and 'optim_args' in train_config:
train_config['optim_args'][
if train_config is not None:
train_config.optimizer_builder[
'apply_decay_param_fun'] = apply_decay_param_fun
train_dataloader, eval_dataloader = reader()
......
......@@ -5,7 +5,8 @@ TrainConfig:
epochs: 14
eval_iter: 400
learning_rate: 5.0e-03
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
optimizer: SGD
......@@ -2,28 +2,21 @@ Global:
reader_config: configs/pp_humanseg_lite.yaml
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_batch_norm_47.tmp_2
alpha: 1.0
loss: l2
node:
- batch_norm_47.tmp_2
merge_feed: true
teacher_model_dir: ./ppseg_lite_portrait_398x224_with_softmax
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
Quantization:
activation_bits: 8
is_full_quantize: false
not_quant_pattern:
- skip_quant
quantize_op_types:
- conv2d
- depthwise_conv2d
weight_bits: 8
TrainConfig:
epochs: 1
eval_iter: 400
learning_rate: 0.0005
optimizer: SGD
optim_args:
weight_decay: 4.0e-05
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
......@@ -2,19 +2,15 @@ Global:
reader_config: configs/pp_humanseg_lite.yaml
Distillation:
distill_lambda: 1.0
distill_loss: l2_loss
distill_node_pair:
- teacher_batch_norm_47.tmp_2
alpha: 1.0
loss: l2
node:
- batch_norm_47.tmp_2
merge_feed: true
teacher_model_dir: ./ppseg_lite_portrait_398x224_with_softmax
teacher_model_filename: model.pdmodel
teacher_params_filename: model.pdiparams
UnstructurePrune:
prune_strategy: gmp
prune_mode: ratio
pruned_ratio: 0.75
ratio: 0.75
gmp_config:
stable_iterations: 0
pruning_iterations: 4500
......@@ -24,6 +20,7 @@ UnstructurePrune:
initial_ratio: 0.15
prune_params_type: conv1x1_only
local_sparsity: True
TrainConfig:
epochs: 14
eval_iter: 400
......@@ -31,7 +28,7 @@ TrainConfig:
type: PiecewiseDecay
boundaries: [4500]
values: [0.005, 0.0005]
optim_args:
optimizer_builder:
optimizer:
type: SGD
weight_decay: 0.0005
optimizer: SGD
......@@ -13,7 +13,7 @@ import numpy as np
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
import models
from utility import add_arguments, print_arguments, _download, _decompress
from paddleslim.dist import merge, l2_loss, soft_label_loss, fsp_loss
from paddleslim.dist import merge, l2, soft_label, fsp
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
......@@ -173,8 +173,8 @@ def compress(args):
merge(teacher_program, student_program, data_name_map, place)
with paddle.static.program_guard(student_program, s_startup):
distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
student_program)
distill_loss = soft_label("teacher_fc_0.tmp_0", "fc_0.tmp_0",
student_program)
loss = avg_cost + distill_loss
lr, opt = create_optimizer(args)
opt.minimize(loss)
......
......@@ -89,7 +89,7 @@ In order to ensure that the data of the teacher network and the student network
data_name_map = {'image': 'image'}
main = slim.dist.merge(teacher_program, student_program, data_name_map, fluid.CPUPlace())
with fluid.program_guard(student_program, student_startup):
l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
l2_loss = slim.dist.l2('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
loss = l2_loss + avg_cost
opt = fluid.optimizer.Momentum(0.01, 0.9)
opt.minimize(loss)
......
......@@ -27,10 +27,12 @@ AutoCompression
Currently only the following combined strategies or single-strategy configurations are supported:
1) ``Quantization`` & ``HyperParameterOptimization``: post-training quantization with hyperparameter search;
2) ``Quantization`` & ``Distillation``: quantization-aware training with distillation;
3) ``Prune`` & ``Distillation``: structured pruning with distillation;
4) ``UnstructurePrune`` & ``Distillation``: unstructured sparsity with distillation;
5) ``Distillation``: single-teacher distillation only;
6) ``MultiTeacherDistillation``: multi-teacher distillation.
3) ``ChannelPrune`` & ``Distillation``: channel pruning with distillation;
4) ``ASPPrune`` & ``Distillation``: ASP semi-structured pruning with distillation;
5) ``TransformerPrune`` & ``Distillation``: Transformer structured pruning with distillation;
6) ``UnstructurePrune`` & ``Distillation``: unstructured sparsity with distillation;
7) ``Distillation``: single-teacher distillation only;
8) ``MultiTeacherDistillation``: multi-teacher distillation.
If set to None, a strategy is selected automatically for compression. Default: None.
- **eval_callback(function, optional)** - Evaluation callback used to track how well the model is training; see `<https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/static/auto-compression/custom_function.rst>`_ for how to write it. ``eval_callback`` and ``eval_dataloader`` must not both be None. Default: None.
- **eval_dataloader(paddle.io.Dataloader, optional)** - If a test data loader is passed in, the ``EMD`` distance is used to measure the difference between the model before and after compression; currently only post-training quantization with hyperparameter search supports judging the compression result this way.
......@@ -62,11 +64,11 @@ AutoCompression
default_distill_config = {
"distill_loss": args.distill_loss,
"loss": args.loss,
"distill_node_pair": args.distill_node_pair,
"node": args.node,
"distill_lambda": args.distill_lambda,
"alpha": args.alpha,
"teacher_model_dir": args.teacher_model_dir,
......@@ -84,7 +86,7 @@ AutoCompression
strategy_config="Quantization": Quantization(**default_ptq_config),
"HyperParameterOptimization": HyperParameterOptimization(**default_hpo_config)}, \
"Distillation": HyperParameterOptimization(**default_distill_config)}, \
train_config=None, train_dataloader=train_dataloader, eval_callback=eval_dataloader,devices='gpu')
......@@ -104,12 +106,14 @@ TrainConfig
**Parameters:**
- **epochs(int)** - Number of training epochs, i.e. how many times the current dataset is traversed.
- **learning_rate(float|LRScheduler)** - Learning rate used during model optimization.
- **optimizer(str)** - Optimizer to use; must be the name of an optimizer in ``paddle.optimizer``, e.g. ``SGD``.
- **optim_args(dict)** - Optimizer arguments. The following keys can be specified:
``grad_clip``, the gradient clipping method to use; must be the name of a gradient clipping class in ``paddle.nn``, e.g. ``ClipGradByValue``.
``grad_clip_args``, the arguments of the chosen gradient clipping method; for example, when ``ClipGradByValue`` is selected, ``grad_clip_args`` can set ``max`` and ``min``, see `ClipGradByValue <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/ClipGradByValue_cn.html#clipgradbyvalue>`_.
Other arguments an optimizer may need, e.g. ``beta1``, ``beta2``, ``apply_decay_param_fun``, see `AdamW <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/optimizer/AdamW_cn.html#adamw>`_.
- **train_iter(int, optional)** - Total number of training iterations, i.e. how many batches to iterate over; only one of ``epochs`` and ``train_iter`` needs to be set.
- **learning_rate(float|dict)** - Learning rate used during model optimization. If a dict is given, its keys are: ``type``, the class name of the learning-rate schedule, chosen from the classes in ``paddle.optimizer.lr``;
the remaining keys follow the parameters of the schedule class actually used.
- **optimizer_builder(dict)** - The optimizer to use and its related configuration (see the YAML sketch below). The keys of the dict are:
``optimizer(dict)``: the key ``type`` must be the class name of an optimizer in ``paddle.optimizer``, e.g. ``SGD``; the remaining keys follow the parameters of the optimizer actually used.
``weight_decay(float, optional)``: weight decay applied during compression training.
``regularizer(dict)``: the key ``type`` must be the class name of a weight-decay regularizer in ``paddle.regularizer``; the remaining keys follow the parameters of the class actually used.
``grad_clip(dict)``: the key ``type`` must be the class name of a gradient clipping class in ``paddle.nn``, e.g. ``ClipGradByValue``; the remaining keys follow the parameters of the class actually used.
- **eval_iter(int)** - Run an evaluation every this many batches of training.
- **logging_iter(int)** - Print logs every this many batches of training.
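As a reference, a minimal sketch of the corresponding YAML ``TrainConfig`` block, assuming a ``PiecewiseDecay`` schedule and ``ClipGradByValue`` clipping; all values are illustrative:

.. code-block:: yaml

    TrainConfig:
      epochs: 1
      eval_iter: 1000
      learning_rate:
        type: PiecewiseDecay
        boundaries: [4500]
        values: [0.005, 0.0005]
      optimizer_builder:
        optimizer:
          type: SGD
        weight_decay: 0.0005
        grad_clip:
          type: ClipGradByValue
          max: 1.0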
......@@ -124,7 +128,7 @@ TrainConfig
Refer to `amp_config <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp_configs>`_ for the corresponding parameter settings.
- **recompute_config(dict, optional)** - The recompute memory-optimization logic can be used when the fleet API is used. Configure the parameters as described in the fleet interface: `recompute_configs <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute_configs>`_.
- **sharding_config(dict, optional)** - The sharding strategy can be used when the fleet API is used. Configure the parameters as described in the fleet interface: `sharding_configs <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding_configs>`_.
- **sparse_model(bool, optional)** - Set ``sparse_model`` to True to remove the redundant mask tensors from models produced by unstructured sparsity. Default: False.
Quantization
----------
......@@ -147,14 +151,13 @@ Distillation
**Parameters:**
- **distill_loss(str|list[str])** - Name(s) of the distillation loss; the available losses are those supported by PaddleSlim: ``fsp_loss``, ``l2_loss``, ``soft_label_loss``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **loss(str|list[str])** - Name(s) of the distillation loss; the available losses are those supported by PaddleSlim: ``fsp``, ``l2``, ``soft_label``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **distill_node_pair(list[str])** - List of distillation node names; every two nodes form a pair, one from the teacher model and one from the student model.
- **distill_lambda(float|list[float])** - Weight of each distillation loss; its length must match the length of ``distill_loss``.
- **node(list[str])** - List of distillation node names. Two cases are supported: 1. for self-distillation, the nodes only need to contain student nodes, and multiple nodes are supported; 2. for other kinds of distillation, the nodes must contain the teacher nodes and the corresponding student nodes, grouped in pairs, one from the teacher model and one from the student model.
- **alpha(float|list[float])** - Weight of each distillation loss; its length must match the length of ``loss``.
- **teacher_model_dir(str)** - Directory of the teacher model.
- **teacher_model_filename(str)** - File name of the teacher model.
- **teacher_params_filename(str)** - File name of the teacher parameters.
- **merge_feed(bool)** - Whether the distillation shares the same input data. Default: ``True``.
MultiTeacherDistillation
......@@ -164,14 +167,13 @@ MultiTeacherDistillation
**Parameters:**
- **distill_loss(list[str])** - Names of the distillation losses; the available losses are those supported by PaddleSlim: ``fsp_loss``, ``l2_loss``, ``soft_label_loss``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **loss(list[str])** - Names of the distillation losses; the available losses are those supported by PaddleSlim: ``fsp``, ``l2``, ``soft_label``. If you need another loss, you can add it to the `distillation loss file <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py>`_ for now, or open an issue and we will help.
- **distill_node_pair(list[list[str]])** - Nested list of distillation node names; the number of teacher models must match the length of the outer list. Each inner list describes the distillation nodes between one teacher model and the student model, where every two nodes form a pair, one from the teacher model and one from the student model.
- **distill_lambda(list[float])** - Weight of each distillation loss; its length must match the length of ``distill_loss``.
- **node(list[list[str]])** - Nested list of distillation node names; the number of teacher models must match the length of the outer list. Each inner list describes the distillation nodes between one teacher model and the student model, where every two nodes form a pair, one from the teacher model and one from the student model.
- **alpha(list[float])** - Weight of each distillation loss; its length must match the length of ``loss``.
- **teacher_model_dir(list[str])** - List of teacher model directories.
- **teacher_model_filename(list[str])** - List of teacher model file names.
- **teacher_params_filename(list[str])** - List of teacher parameter file names.
- **merge_feed(bool)** - Whether the distillation shares the same input data. Default: ``True``.
HyperParameterOptimization
......
......@@ -49,16 +49,16 @@ merge
data_name_map, place)
fsp_loss
fsp
---------
.. py:function:: paddleslim.dist.fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=None)
.. py:function:: paddleslim.dist.fsp(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=None)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L90>`_
Add fsp_loss between the teacher vars and student vars inside the program.
Add fsp loss between the teacher vars and student vars inside the program.
The fsp_loss comes from the paper `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning <http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf>`_
The fsp loss comes from the paper `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning <http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf>`_
**Parameters:**
......@@ -70,7 +70,7 @@ fsp_loss出自论文 `A Gift from Knowledge Distillation: Fast Optimization, Net
**Returns:**
- (Variable): The fsp_loss built from teacher_var1, teacher_var2, student_var1 and student_var2
- (Variable): The fsp loss built from teacher_var1, teacher_var2, student_var1 and student_var2
**Example:**
......@@ -96,15 +96,15 @@ fsp_loss出自论文 `A Gift from Knowledge Distillation: Fast Optimization, Net
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
dist.merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program):
distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
distillation_loss = dist.fsp('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
's1.tmp_1', 's2.tmp_1', student_program)
l2_loss
l2
------------
.. py:function:: paddleslim.dist.l2_loss(teacher_var_name, student_var_name, program=None)
.. py:function:: paddleslim.dist.l2(teacher_var_name, student_var_name, program=None)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L118>`_
......@@ -144,15 +144,15 @@ l2_loss
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
dist.merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program):
distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
distillation_loss = dist.l2('teacher_t2.tmp_1', 's2.tmp_1',
student_program)
soft_label_loss
soft_label
-------------------
.. py:function:: paddleslim.dist.soft_label_loss(teacher_var_name, student_var_name, program=None, teacher_temperature=1., student_temperature=1.)
.. py:function:: paddleslim.dist.soft_label(teacher_var_name, student_var_name, program=None, teacher_temperature=1., student_temperature=1.)
`[源代码] <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L136>`_
......@@ -170,7 +170,7 @@ soft_label_loss出自论文 `Distilling the Knowledge in a Neural Network <https
**Returns:**
- (Variable): The soft_label_loss built from teacher_var and student_var
- (Variable): The soft label loss built from teacher_var and student_var
**Example:**
......
......@@ -92,7 +92,7 @@ merge操作将student_program和teacher_program中的所有Tensor和Op都将被
data_name_map = {'image': 'image'}
main = slim.dist.merge(teacher_program, student_program, data_name_map, paddle.CPUPlace())
with paddle.static.program_guard(student_program, student_startup):
l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
l2_loss = slim.dist.l2('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
loss = l2_loss + avg_cost
opt = paddle.optimizer.Momentum(0.01, 0.9)
opt.minimize(loss)
......
......@@ -92,7 +92,6 @@ def create_strategy_config(strategy_str, model_type):
### default prune config
default_prune_config = {
'pruned_ratio': float(tmp_s[1]),
'prune_algo': 'prune',
'criterion': 'l1_norm'
}
else:
......@@ -105,10 +104,12 @@ def create_strategy_config(strategy_str, model_type):
'local_sparsity': True,
'prune_params_type': 'conv1x1_only'
}
tmp_s[0] = tmp_s[0].replace('prune', 'Prune')
if model_type == 'transformer':
tmp_s[0] = tmp_s[0].replace('prune', 'TransformerPrune')
default_prune_config = {'pruned_ratio': float(tmp_s[1])}
else:
tmp_s[0] = tmp_s[0].replace('prune', 'Prune')
tmp_s[0] = tmp_s[0].replace('sparse', 'UnstructurePrune')
if model_type == 'transformer' and tmp_s[0] == 'Prune':
default_prune_config['prune_algo'] = 'transformer_pruner'
prune_config = eval(tmp_s[0])(**default_prune_config)
configs.append({tmp_s[0]: prune_config, 'Distillation': dis_config})
......
......@@ -82,15 +82,21 @@ class AutoCompression:
2. set ``Quantization`` and ``HyperParameterOptimization`` to get quant_post and hyperparameter optimization compress config.
The Quantization config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L24`_ .
The HyperParameterOptimization config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L73`_ .
3. set ``Prune`` and ``Distillation`` to get prune and distillation compress config.
The Prune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
3. set ``ChannelPrune`` and ``Distillation`` to get channel prune and distillation compress config.
The ChannelPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
4. set ``UnstructurePrune`` and ``Distillation`` to get unstructureprune and distillation compress config.
4. set ``ASPPrune`` and ``Distillation`` to get asp prune and distillation compress config.
The ASPPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
5. set ``TransformerPrune`` and ``Distillation`` to get transformer prune and distillation compress config.
The TransformerPrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L82`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
6. set ``UnstructurePrune`` and ``Distillation`` to get unstructureprune and distillation compress config.
The UnstructurePrune config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L91`_ .
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
5. set ``Distillation`` to use one teacher model to distill the student model.
7. set ``Distillation`` to use one teacher model to distill the student model.
The Distillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L39`_ .
6. set ``MultiTeacherDistillation`` to use multiple teachers to distill the student model.
8. set ``MultiTeacherDistillation`` to use multiple teachers to distill the student model.
The MultiTeacherDistillation config can reference `https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/auto_compression/strategy_config.py#L56`_ .
If set to None, will choose a strategy automatically. Default: None.
......@@ -149,6 +155,8 @@ class AutoCompression:
self._strategy, self._config = self._prepare_strategy(
self.strategy_config)
#print(self._strategy, self._config[0].__dict__)
#sys.exit()
# If train_config is None, set default train_config
if self.train_config is None:
......@@ -189,14 +197,15 @@ class AutoCompression:
for strategy_c in strategy_config:
quant_config = strategy_c.get("Quantization", None)
hpo_config = strategy_c.get("HyperParameterOptimization", None)
prune_config = strategy_c.get("Prune", None)
prune_config = strategy_c.get("ChannelPrune", None)
asp_config = strategy_c.get("ASPPrune", None)
transformer_prune_config = strategy_c.get("TransformerPrune", None)
unstructure_prune_config = strategy_c.get("UnstructurePrune", None)
single_teacher_distill_config = strategy_c.get("Distillation", None)
if single_teacher_distill_config is not None and single_teacher_distill_config.teacher_model_dir is None:
single_teacher_distill_config = single_teacher_distill_config._replace(
teacher_model_dir=self.model_dir,
teacher_model_filename=self.model_filename,
teacher_params_filename=self.params_filename)
single_teacher_distill_config.teacher_model_dir = self.model_dir
single_teacher_distill_config.teacher_model_filename = self.model_filename
single_teacher_distill_config.teacher_params_filename = self.params_filename
multi_teacher_distill_config = strategy_c.get(
"MultiTeacherDistillation", None)
......@@ -219,17 +228,29 @@ class AutoCompression:
### case3: prune_config & distill config
elif prune_config is not None and self._distill_config is not None:
strategy.append('prune_dis')
strategy.append('channel_prune_dis')
config.append(merge_config(prune_config, self._distill_config))
### case4: unstructure_config & distill config
### case4: asp_config & distill config
elif asp_config is not None and self._distill_config is not None:
strategy.append('asp_prune_dis')
config.append(merge_config(asp_config, self._distill_config))
### case5: transformer_prune_config & distill config
elif transformer_prune_config is not None and self._distill_config is not None:
strategy.append('transformer_prune_dis')
config.append(
merge_config(transformer_prune_config,
self._distill_config))
### case6: unstructure_config & distill config
elif unstructure_prune_config is not None and self._distill_config is not None:
strategy.append('unstructure_prune_dis')
config.append(
merge_config(unstructure_prune_config,
self._distill_config))
### case4: distill_config
### case7: distill_config
elif self._distill_config is not None:
if single_teacher_distill_config is not None:
strategy.append('single_teacher_dis')
......@@ -272,7 +293,7 @@ class AutoCompression:
train_program_info = ProgramInfo(startup_program, train_program,
feed_target_names, fetch_targets)
config_dict = dict(config._asdict())
config_dict = config.__dict__
if "prune_strategy" in config_dict and config_dict[
"prune_strategy"] == "gmp" and config_dict[
'gmp_config'] is None:
......@@ -313,7 +334,7 @@ class AutoCompression:
self._exe,
self._places,
config_dict,
self.train_config._asdict(),
self.train_config.__dict__,
train_program_info,
pruner=self._pruner,
dist_strategy=dist_strategy,
......@@ -345,7 +366,7 @@ class AutoCompression:
train_program_info.optimizer.amp_init(
self._places, scope=paddle.static.global_scope())
if 'prune_algo' in config_dict and config_dict['prune_algo'] == 'asp':
if 'asp' in strategy:
### prune weight in scope
self._pruner.prune_model(train_program_info.program)
......
......@@ -33,7 +33,7 @@ def load_config(config_path):
compress_config = {}
for key, value in cfg.items():
default_key = eval(key)(**value)
default_key = eval(key)(**value) if value is not None else eval(key)()
compress_config[key] = default_key
if compress_config.get('TrainConfig') != None:
......
......@@ -17,6 +17,7 @@ import numpy as np
import paddle
import paddle.distributed.fleet as fleet
import paddle.optimizer as optimizer
import paddle.regularizer as regularizer
from ..quant.quanter import quant_aware, _quant_config_default, _parse_configs, pact, get_pact_optimizer
from ..dist import *
from ..common.recover_program import recover_inference_program, _remove_fetch_node
......@@ -44,37 +45,73 @@ def _create_lr_scheduler(train_config):
def _create_optimizer(train_config):
"""create optimizer"""
opt = getattr(optimizer, train_config.get('optimizer') or
'SGD') ### default optimizer is SGD
if 'optim_args' in train_config:
if train_config[
'optim_args'] is not None and 'grad_clip' in train_config[
'optim_args'] and train_config['optim_args'][
'grad_clip'] is not None:
grad_clip = getattr(
paddle.nn, train_config['optim_args']['grad_clip'])(
**train_config['optim_args']['grad_clip_args'])
train_config['optim_args'].pop('grad_clip')
train_config['optim_args'].pop('grad_clip_args')
else:
grad_clip = None
if 'grad_clip' in train_config['optim_args'] and train_config[
'optim_args']['grad_clip'] is None:
train_config['optim_args'].pop('grad_clip')
train_config['optim_args'].pop('grad_clip_args')
if 'optimizer_builder' not in train_config:
train_config['optimizer_builder'] = {'optimizer': {'type': 'SGD'}}
optimizer_builder = train_config['optimizer_builder']
if 'grad_clip' in optimizer_builder:
g_clip_params = optimizer_builder['grad_clip']
g_clip_type = g_clip_params.pop('type')
grad_clip = getattr(paddle.nn, g_clip_type)(**g_clip_params)
else:
train_config['optim_args'] = {}
grad_clip = None
### build regularization
if 'regularizer' in optimizer_builder:
reg_params = optimizer_builder['regularizer']
reg_type = reg_params.pop('type')
reg = getattr(regularizer, reg_type)(**reg_params)
elif 'weight_decay' in optimizer_builder:
reg = optimizer_builder.pop('weight_decay')
else:
reg = None
### build learning rate
lr = _create_lr_scheduler(train_config)
op = opt(learning_rate=lr,
grad_clip=grad_clip,
**train_config['optim_args'])
return op, lr
### build optimizer
optim_params = optimizer_builder['optimizer']
optim_type = optim_params.pop('type')
opt = getattr(optimizer, optim_type)(learning_rate=lr,
grad_clip=grad_clip,
weight_decay=reg,
**optim_params)
return opt, lr
def _get_distill_node(student_program, config):
node = config.get('node')
if len(node) == 0:
return None
### the type of node is list or list(list)
if isinstance(node[0], list):
test_node = node[0][0]
else:
test_node = node[0]
try:
test_var = student_program.global_block().var(test_node)
distill_node_pair = []
if isinstance(node[0], list):
for n_list in node:
tmp_node_pair = []
for n in n_list:
tmp_node_pair.append('teacher_' + n)
tmp_node_pair.append(n)
distill_node_pair.append(tmp_node_pair)
else:
for n in node:
distill_node_pair.append('teacher_' + n)
distill_node_pair.append(n)
return distill_node_pair
except:
return node
def _parse_distill_loss(distill_node_pair,
distill_loss='l2_loss',
distill_loss='l2',
distill_lambda=1.0):
"""parse distill loss config"""
loss_dist = 0.0
......@@ -135,9 +172,9 @@ def _load_program_and_merge(executor,
data_name_map = {}
if 'merge_feed' not in config or config['merge_feed'] == True:
assert len(feed_target_names) == len(teacher_feed_target_names), \
"the number of feed nodes in the teacher model is not equal to the student model"
merge_feed = (
sorted(feed_target_names) == sorted(teacher_feed_target_names))
if merge_feed == True:
for i, name in enumerate(feed_target_names):
data_name_map[teacher_feed_target_names[i]] = name
......@@ -153,7 +190,7 @@ def _load_program_and_merge(executor,
place,
teacher_scope=new_scope,
name_prefix=teacher_name_prefix,
merge_feed=config.get('merge_feed') or True)
merge_feed=merge_feed)
if teacher_idx == None or teacher_idx == 1:
return train_program, test_program, data_name_map
else:
......@@ -180,6 +217,9 @@ def build_distill_program(executor,
feed_target_names = train_program_info.feed_target_names
fetch_targets = train_program_info.fetch_targets
distill_node_pair = _get_distill_node(train_program,
config) or default_distill_node_pair
teacher_model_dir = config[
"teacher_model_dir"] if "teacher_model_dir" in config else config[
"teacher_model_path_prefix"]
......@@ -270,16 +310,15 @@ def build_distill_program(executor,
**train_config['amp_config'])
distill_loss, losses = _parse_distill_loss(
config.get('distill_node_pair') or default_distill_node_pair,
config.get('distill_loss') or
'l2_loss', ### default loss is l2_loss
config.get('distill_lambda') or 1.0) ### default lambda is 1.0
distill_node_pair,
config.get('loss') or 'l2', ### default loss is l2
config.get('alpha') or 1.0) ### default alpha is 1.0
loss = paddle.mean(distill_loss)
loss.stop_gradient = False
if 'prune_algo' in config: ### prune & asp
if config['prune_algo'] == 'asp' and not train_config.get(
'use_fleet'):
if 'prune_params_name' in config: ### prune
if 'pruned_ratio' not in config and not train_config.get(
'use_fleet'): ### asp
optimizer = pruner.decorate(optimizer)
optimizer.minimize(loss)
elif 'prune_strategy' in config: ###unstructure prune
......@@ -302,11 +341,8 @@ def build_quant_program(executor, place, config, train_program_info,
scope = paddle.static.global_scope()
assert isinstance(config, dict), "quant config must be dict"
default_config = _quant_config_default
default_config.update(config)
config = _parse_configs(default_config)
use_pact = config["use_pact"]
use_pact = config.pop("use_pact")
if use_pact:
act_preprocess_func = pact
optimizer_func = get_pact_optimizer
......@@ -364,13 +400,13 @@ def build_prune_program(executor,
strategy,
patterns,
eval_dataloader=None):
if 'unstructure' in strategy:
if strategy.startswith('unstructure'):
from ..prune.unstructured_pruner import UnstructuredPruner, GMPUnstructuredPruner
if config["prune_strategy"] is None:
pruner = UnstructuredPruner(
train_program_info.program,
mode=config['prune_mode'],
ratio=config['pruned_ratio'],
ratio=config['ratio'],
threshold=config['threshold'],
prune_params_type=config['prune_params_type'],
place=place,
......@@ -378,69 +414,65 @@ def build_prune_program(executor,
elif config["prune_strategy"] == "gmp":
pruner = GMPUnstructuredPruner(
train_program_info.program,
ratio=config['pruned_ratio'],
ratio=config['ratio'],
prune_params_type=config['prune_params_type'],
place=place,
local_sparsity=config['local_sparsity'],
configs=config['gmp_config'])
elif strategy.startswith('channel_prune'):
from ..prune import Pruner
pruner = Pruner(config["criterion"])
params = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block().all_parameters():
if config['prune_params_name'] is not None and param.name in config[
'prune_params_name']:
params.append(param.name)
pruned_program, _, _ = pruner.prune(
train_program_info.program,
paddle.static.global_scope(),
params=params,
ratios=[config['pruned_ratio']] * len(params),
place=place)
train_program_info.program = pruned_program
elif strategy.startswith('asp'):
from paddle.static import sparsity
pruner = sparsity
excluded_params_name = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block().all_parameters():
if config[
'prune_params_name'] is not None and param.name not in config[
'prune_params_name']:
excluded_params_name.append(param.name)
if "teacher_" in param.name:
excluded_params_name.append(param.name)
pruner.set_excluded_layers(train_program_info.program,
excluded_params_name)
elif strategy.startswith('transformer_prune'):
from .transformer_pruner import TransformerPruner
assert eval_dataloader is not None, "transformer_pruner must set eval_dataloader"
label_info = _get_label_info(eval_dataloader,
train_program_info.feed_target_names)
assert len(label_info) != 0, \
"maybe something wrong in get label name from eval_dataloader, please check your eval_dataloader"
pruner = TransformerPruner(
executor,
place,
train_program_info.program,
patterns,
label_info,
width_mult=(1.0 - config['pruned_ratio']),
dataloader=eval_dataloader,
fetch_targets=train_program_info.fetch_targets)
pruned_program = pruner.prune()
train_program_info.program = pruned_program
else:
if config['prune_algo'] == 'prune':
from ..prune import Pruner
pruner = Pruner(config["criterion"])
params = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block(
).all_parameters():
if config[
'prune_params_name'] is not None and param.name in config[
'prune_params_name']:
params.append(param.name)
pruned_program, _, _ = pruner.prune(
train_program_info.program,
paddle.static.global_scope(),
params=params,
ratios=[config['pruned_ratio']] * len(params),
place=place)
train_program_info.program = pruned_program
elif config['prune_algo'] == 'asp':
from paddle.static import sparsity
pruner = sparsity
excluded_params_name = []
### TODO(ceci3): set default prune weight
for param in train_program_info.program.global_block(
).all_parameters():
if config[
'prune_params_name'] is not None and param.name not in config[
'prune_params_name']:
excluded_params_name.append(param.name)
if "teacher_" in param.name:
excluded_params_name.append(param.name)
pruner.set_excluded_layers(train_program_info.program,
excluded_params_name)
elif config['prune_algo'] == 'transformer_pruner':
from .transformer_pruner import TransformerPruner
assert eval_dataloader is not None, "transformer_pruner must set eval_dataloader"
label_info = _get_label_info(eval_dataloader,
train_program_info.feed_target_names)
assert len(label_info) != 0, \
"maybe something wrong in get label name from eval_dataloader, please check your eval_dataloader"
pruner = TransformerPruner(
executor,
place,
train_program_info.program,
patterns,
label_info,
width_mult=(1.0 - config['pruned_ratio']),
dataloader=eval_dataloader,
fetch_targets=train_program_info.fetch_targets)
pruned_program = pruner.prune()
train_program_info.program = pruned_program
else:
raise NotImplementedError(
"prune_algo must be choice in [\"prune\", \"asp\"], {} is not support".
format(config['prune_algo']))
raise NotImplementedError(
"prune_algo must be choice in [\"prune\", \"asp\"], {} is not support".
format(config['prune_algo']))
return pruner, train_program_info
......
......@@ -16,122 +16,301 @@ from collections import namedtuple
__all__ = [
"Quantization", "Distillation", "MultiTeacherDistillation", \
"HyperParameterOptimization", "Prune", "UnstructurePrune", \
"merge_config", "ProgramInfo", "TrainConfig",
"HyperParameterOptimization", "ChannelPrune", "UnstructurePrune", \
"TransformerPrune", "ASPPrune", "merge_config", "ProgramInfo", "TrainConfig",
]
### Quantization:
Quantization = namedtuple(
"Quantization",
[
"quantize_op_types",
"weight_bits",
"activation_bits",
"not_quant_pattern", # Only support in QAT
"use_pact", # Only support in QAT
"is_full_quantize",
"activation_quantize_type",
"weight_quantize_type"
])
Quantization.__new__.__defaults__ = (None, ) * (
len(Quantization._fields) - 3) + (False, 'moving_average_abs_max',
'channel_wise_abs_max')
### Distillation:
Distillation = namedtuple(
"Distillation",
[
"distill_loss", ### list[list],支持不同节点之间使用不同的loss。
"distill_node_pair", ### list[list],支持不同节点之间使用不同的loss。
"distill_lambda", ### list[list],支持不同节点之间使用不同的loss。
"teacher_model_dir",
"teacher_model_filename",
"teacher_params_filename",
"merge_feed",
])
Distillation.__new__.__defaults__ = (None, ) * (len(Distillation._fields) - 1
) + (True, )
### Multi-teacher distillation config
### Multi-Teacher Distillation:
MultiTeacherDistillation = namedtuple(
"MultiTeacherDistillation",
[
"distill_loss", ### list[str],每个teacher对应一个loss
"distill_node_pair", ### list[list],每个teacher对应一个蒸馏。仅支持logits蒸馏,不支持中间层蒸馏
"distill_lambda", ### list[float],每个teacher对应一个lambda。
"teacher_model_dir",
"teacher_model_filename", ### list[str], 每个teacher对应一个模型文件
"teacher_params_filename", ### list[str], 每个teacher对应一个参数文件
"merge_feed",
])
MultiTeacherDistillation.__new__.__defaults__ = (None, ) * (
len(MultiTeacherDistillation._fields) - 1) + (True, )
### If not set, the hyperparameter search uses the default search space; if set, the given search space is used. This also supports a single PTQ strategy
###HyperParameterOptimization
HyperParameterOptimization = namedtuple("HyperParameterOptimization", [
"ptq_algo", "bias_correct", "weight_quantize_type", "hist_percent",
"batch_num", "max_quant_count"
])
HyperParameterOptimization.__new__.__defaults__ = (None, ) * (
len(HyperParameterOptimization._fields) - 1) + (20, )
### Prune
Prune = namedtuple("Prune", [
"prune_algo",
"pruned_ratio",
"prune_params_name",
"criterion",
])
Prune.__new__.__defaults__ = (None, ) * len(Prune._fields)
### UnstructurePrune
UnstructurePrune = namedtuple("UnstructurePrune", [
"prune_strategy",
"prune_mode",
"threshold",
"pruned_ratio",
"gmp_config",
"prune_params_type",
"local_sparsity",
])
UnstructurePrune.__new__.__defaults__ = (None, ) * len(UnstructurePrune._fields)
### Train
TrainConfig = namedtuple(
"Train",
[
"epochs", # Training total epoch
"train_iter", # Training total iteration, `epochs` or `train_iter` only need to set one.
"learning_rate",
"optimizer",
"optim_args",
"eval_iter",
"logging_iter",
"origin_metric",
"target_metric",
"use_fleet",
"amp_config",
"recompute_config",
"sharding_config",
"sparse_model",
])
TrainConfig.__new__.__defaults__ = (None, ) * len(TrainConfig._fields)
class Quantization:
def __init__(self,
quantize_op_types=[
'conv2d', 'depthwise_conv2d', 'mul', 'matmul', 'matmul_v2'
],
weight_bits=8,
activation_bits=8,
not_quant_pattern=['skip_quant'],
use_pact=False,
activation_quantize_type='moving_average_abs_max',
weight_quantize_type='channel_wise_abs_max',
dtype='int8',
window_size=10000,
moving_rate=0.9,
for_tensorrt=False,
is_full_quantize=False):
"""
Quantization Config.
Args:
quantize_op_types(list(str)): Ops of type in quantize_op_types, will be quantized. Default: ['conv2d', 'depthwise_conv2d', 'mul', 'matmul', 'matmul_v2'].
weight_bits(int): Weight quantize bit num. Default: 8.
activation_bits(int): Activation quantize bit num. Default 8.
not_quant_pattern(list(str)): Ops of name_scope in not_quant_pattern list, will not be quantized. Default: 'skip_quant'.
use_pact(bool): Whether to use pact in quantization training. Default: False.
activation_quantize_type(str): Activation quantize type. Default is 'moving_average_abs_max'.
weight_quantize_type(str): Weight quantize type. Default 'channel_wise_abs_max'.
dtype(str): Data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'.
window_size(int): Window size for 'range_abs_max' quantization. Default: 10000.
moving_rate(float): The decay coefficient of moving average. Default: 0.9.
for_tensorrt(bool): If True, 'quantize_op_types' will be TENSORRT_OP_TYPES. Default: False.
is_full_quantize(bool): If True, 'quantize_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES. Default: False.
"""
self.quantize_op_types = quantize_op_types
self.weight_bits = weight_bits
self.activation_bits = activation_bits
self.not_quant_pattern = not_quant_pattern
self.use_pact = use_pact
self.is_full_quantize = is_full_quantize
self.activation_quantize_type = activation_quantize_type
self.weight_quantize_type = weight_quantize_type
self.dtype = dtype
self.window_size = window_size
self.moving_rate = moving_rate
self.for_tensorrt = for_tensorrt
self.is_full_quantize = is_full_quantize
class Distillation:
def __init__(self,
loss='l2',
node=[],
alpha=1.0,
teacher_model_dir=None,
teacher_model_filename=None,
teacher_params_filename=None):
"""
Distillation Config.
Args:
loss(str|list(str)): Distillation loss; the available loss types are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html>`_. If a list of losses is given, different nodes can use different distillation losses, and the length of loss must equal the length of node. Default: 'l2'.
node(list(str)|list(list(str))): Distillation nodes, chosen by the user from the model before compression. If a list of lists is given, every inner list uses the same distillation loss, and the length of the outer list must equal the length of loss. Default: [].
alpha(float|list(float)): The weight of the distillation loss. If a list of alphas is given, its length must equal the length of loss. Default: 1.0.
teacher_model_dir(str, optional): The path of the teacher inference model; the model and params saved by ``paddle.static.io.save_inference_model`` are under this path. If set to None, the teacher model will be the model before compression. Default: None.
teacher_model_filename(str, optional): The name of teacher model file. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
teacher_params_filename(str, optional): The name of teacher params file. When all parameters are saved in a single file, set it as filename. If parameters are saved in separate files, set it as 'None'. Default : 'None'.
"""
self.loss = loss
self.node = node
self.alpha = alpha
self.teacher_model_dir = teacher_model_dir
self.teacher_model_filename = teacher_model_filename
self.teacher_params_filename = teacher_params_filename
class MultiTeacherDistillation:
def __init__(self,
loss=[],
node=[],
alpha=[],
teacher_model_dir=[],
teacher_model_filename=[],
teacher_params_filename=[]):
"""
Multi-Teacher Distillation Config.
Args:
loss(list(str)): The list of distillation losses; the available loss types are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/dist/single_distiller_api.html>`_. Losses correspond one-to-one to teacher models. Default: [].
node(list(list(str))): Nested list of distillation nodes, chosen by the user from the model before compression. Every inner list describes the nodes distilled with one teacher model, and the length of the outer list must equal the length of loss. Default: [].
alpha(list(float)): The list of weights of the distillation losses. Alphas correspond one-to-one to losses. Default: [].
teacher_model_dir(list): The list of paths of the teacher inference models; the model and params saved by ``paddle.static.io.save_inference_model`` are under each path. If set to None, the teacher model will be the model before compression. Default: None.
teacher_model_filename(list): The list of teacher model file names. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
teacher_params_filename(list): The list of teacher params file names. When all parameters are saved in a single file, set it as the filename. If parameters are saved in separate files, set it as 'None'. Default: 'None'.
"""
self.loss = loss
self.node = node
self.alpha = alpha
self.teacher_model_dir = teacher_model_dir
self.teacher_model_filename = teacher_model_filename
self.teacher_params_filename = teacher_params_filename
class HyperParameterOptimization:
def __init__(self,
ptq_algo=["KL", "hist", "avg", "mse"],
bias_correct=[True, False],
weight_quantize_type=['channel_wise_abs_max'],
hist_percent=[0.98, 0.999],
batch_num=[10, 30],
max_quant_count=20):
"""
HyperParameterOptimization Config.
Args:
ptq_algo(list(str)): Post-training quantization algorithms; the available algorithms are listed at `<https://paddleslim.readthedocs.io/zh_CN/latest/api_cn/static/quant/quantization_api.html#quant-post-static>`_.
bias_correct(list(bool)): Whether to use bias correction.
weight_quantize_type(list(str)): Quantization type for weights, chosen from 'channel_wise_abs_max' or 'abs_max'.
hist_percent(list(float)): The upper and lower bounds of threshold of algo 'hist' for activations, the real percent is uniform sampling in this bounds.
batch_num(list(int)): The upper and lower bounds of batch number, the real batch number is uniform sampling in this bounds.
max_quant_count(int): Max number of model quantization. Default: 20.
"""
self.ptq_algo = ptq_algo
self.bias_correct = bias_correct
self.weight_quantize_type = weight_quantize_type
self.hist_percent = hist_percent
self.batch_num = batch_num
self.max_quant_count = max_quant_count
class ChannelPrune:
def __init__(self, pruned_ratio, prune_params_name, criterion='l1_norm'):
"""
ChannelPrune Config.
Args:
pruned_ratio(float): The ratio of channels to be pruned.
prune_params_name(list(str)): A list of parameter names to be pruned.
criterion(str|function): The criterion used to sort channels for pruning; can be chosen from ['l1_norm', 'bn_scale', 'geometry_median']. Default: 'l1_norm'.
"""
self.pruned_ratio = pruned_ratio
self.prune_params_name = prune_params_name
self.criterion = criterion
class ASPPrune:
def __init__(self, prune_params_name):
"""
ASPPrune Config.
Args:
prune_params_name(list(str)): A list of parameter names to be pruned.
"""
self.prune_params_name = prune_params_name
class TransformerPrune:
def __init__(self, pruned_ratio):
"""
TransformerPrune Config.
Args:
pruned_ratio(float): The ratio to be pruned in each fully-connected layer.
"""
self.pruned_ratio = pruned_ratio
class UnstructurePrune:
def __init__(self,
prune_strategy=None,
prune_mode='ratio',
threshold=0.01,
ratio=0.55,
gmp_config=None,
prune_params_type=None,
local_sparsity=False):
"""
UnstructurePrune Config.
Args:
prune_strategy(str, optional): The pruning strategy, currently we support base and gmp, ``None`` means use base pruning strategy. Default: ``None``.
prune_mode(str): The pruning mode: whether by ratio or by threshold. Default: 'ratio'.
threshold(float): The threshold to set zeros, the abs(weights) lower than which will be zeros. Default: 0.01.
ratio(float): The ratio to set zeros, the smaller portion will be zeros. Default: 0.55.
gmp_config(dict): The dictionary contains all the configs for GMP pruner. Default: None. The detailed description is as below:
.. code-block:: python
{'stable_iterations': int} # the duration of stable phase in terms of global iterations
{'pruning_iterations': int} # the duration of pruning phase in terms of global iterations
{'tunning_iterations': int} # the duration of tunning phase in terms of global iterations
{'resume_iteration': int} # the start timestamp you want to train from, in terms of global iterations
{'pruning_steps': int} # the total times you want to increase the ratio
{'initial_ratio': float} # the initial ratio value
..
prune_params_type(str): Which kind of params should be pruned, we only support None (all but norms) and conv1x1_only for now. Default: None.
local_sparsity(bool): Whether to prune all the parameter matrix at the same ratio or not. Default: False.
"""
self.prune_strategy = prune_strategy
self.prune_mode = prune_mode
self.threshold = threshold
self.ratio = ratio
self.gmp_config = gmp_config
self.prune_params_type = prune_params_type
self.local_sparsity = local_sparsity
class TrainConfig:
def __init__(self,
epochs=None,
train_iter=None,
learning_rate=0.02,
optimizer_builder={'optimizer': {'type': 'SGD'}},
eval_iter=1000,
logging_iter=10,
origin_metric=None,
target_metric=None,
use_fleet=False,
amp_config=None,
recompute_config=None,
sharding_config=None,
sparse_model=False):
"""
Train Config.
Args:
epochs(int): The number of total epochs. Default: None.
train_iter(int): Training total iteration, `epochs` or `train_iter` only need to set one. Default: None.
learning_rate(float|dict): learning rate in the training. If a dict is given, the detailed description of learning_rate is as below:
.. code-block:: python
'type'(str) # the class name of learning rate decay, can reference in paddle.optimizer.lr.
..
the other keys of learning_rate depend on the parameters of the chosen learning-rate decay class.
For example, to use ``PiecewiseDecay``, set learning_rate like:
{'type': PiecewiseDecay, 'boundaries': [4500], 'values': [0.005, 0.0005]}.
optimizer_builder(str|dict): the optimizer used in the training. If a dict is given, the detailed description of optimizer_builder is as below:
.. code-block:: python
'optimizer'(dict) # the 'type' in the optimizer needs to be a class name in paddle.optimizer;
the other keys of the optimizer depend on the parameters of that class.
'weight_decay(float, optional)' # weight decay in the training.
'regularizer(dict)': # the 'type' in the regularizer needs to be a class name in paddle.regularizer;
the other keys of the regularizer depend on the parameters of that class.
'grad_clip(dict)': # the 'type' in the grad_clip needs to be a class name in paddle.nn, such as 'ClipGradByGlobalNorm';
the other keys of grad_clip depend on the parameters of that class.
..
eval_iter(int): Test period in batches. Default: 1000.
logging_iter(int): Log period in batches. Default: 10.
origin_metric(float, optional): The metric of the model before compression; if it is not None, it is used to check whether the dataloader is correct. Default: None.
target_metric(float, optional): The target metric of the model after compression; if set, training stops once the compressed model satisfies it. If not set, training runs for the number of epochs the user configured. Default: None.
use_fleet(bool): Whether to use fleet. Default: False.
amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description is as below if use_fleet=False:
.. code-block:: python
AMP-O1 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id2>`_ :
{'custom_white_list', set} # The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
{'custom_black_list': set} # The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
{'custom_black_varnames': set} # Users' custom black variables' names.
AMP-O2 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id3>`_ :
{'use_pure_fp16': bool} # Whether to use the pure fp16 training.
{'use_fp16_guard': bool} # Whether to use `fp16_guard` when constructing the program.
..
If you want to use AMP-O2, you need to set use_pure_fp16 is True and use_fp16_guard is False.
If use_fleet=True, the key of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_.
recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config only can be set when use_fleet=True, the key of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_.
sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config only can be set when use_fleet=True, the key of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_.
sparse_model(bool, optional): Set sparse_model to ``True`` to remove mask tensor when the compress strategy is unstructure prune. Default: False.
"""
self.epochs = epochs
self.train_iter = train_iter
self.learning_rate = learning_rate
self.optimizer_builder = optimizer_builder
self.eval_iter = eval_iter
self.logging_iter = logging_iter
self.origin_metric = origin_metric
self.target_metric = target_metric
self.use_fleet = use_fleet
self.amp_config = amp_config
self.recompute_config = recompute_config
self.sharding_config = sharding_config
self.sparse_model = sparse_model
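# Editor's sketch (not part of the diff): one way to build a TrainConfig with dict-style
# learning_rate and optimizer_builder, mirroring the docstring above. The concrete values
# are illustrative assumptions only.
#
# train_config = TrainConfig(
#     train_iter=3000,
#     learning_rate={
#         'type': 'PiecewiseDecay',
#         'boundaries': [4500],
#         'values': [0.005, 0.0005]
#     },
#     optimizer_builder={
#         'optimizer': {'type': 'SGD'},
#         'weight_decay': 4.0e-05
#     },
#     eval_iter=1000,
#     logging_iter=10)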
class MergeConfig:
def __init__(self, **kwargs):
for name, value in kwargs.items():
setattr(self, name, value)
def merge_config(*args):
fields = set()
cfg = dict()
for arg in args:
fields = fields.union(arg._fields)
cfg.update(dict(arg._asdict()))
MergeConfig = namedtuple("MergeConfig", fields)
cfg.update(arg.__dict__)
return MergeConfig(**cfg)
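# Editor's sketch (hypothetical, not in the diff): with the MergeConfig class above,
# merge_config appears to flatten the __dict__ of every config object it receives into
# a single MergeConfig. The DummyConfig class below is invented purely for illustration.
#
# class DummyConfig:
#     def __init__(self, **kwargs):
#         self.__dict__.update(kwargs)
#
# merged = merge_config(DummyConfig(epochs=1), DummyConfig(eval_iter=1000))
# assert merged.epochs == 1 and merged.eval_iter == 1000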
......@@ -143,6 +322,16 @@ class ProgramInfo:
fetch_targets,
optimizer=None,
learning_rate=None):
"""
ProgramInfo Config.
Args:
startup_program(paddle.static.Program): The startup program; its meaning is described in `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#cn-api-fluid-default-startup-program>`_.
program(paddle.static.Program): The main program; its meaning is described in `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#cn-api-fluid-default-main-program>`_.
feed_target_names(list(str)): The names of the feed tensors in the program.
fetch_targets(list(Variable)): The fetch variables in the program.
optimizer(Optimizer, optional): The optimizer used in training. Default: None.
learning_rate(float|paddle.optimizer.lr, optional): The learning rate used in training. Default: None.
"""
self.startup_program = startup_program
self.program = program
self.feed_target_names = feed_target_names
......
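# Editor's sketch (illustrative, not part of the diff): ProgramInfo appears to be a plain
# container for the pieces returned by paddle.static.load_inference_model; "./model_dir"
# is a placeholder path, not a file from this PR.
#
# import paddle
# paddle.enable_static()
# exe = paddle.static.Executor(paddle.CPUPlace())
# program, feed_target_names, fetch_targets = paddle.static.load_inference_model(
#     "./model_dir", exe)
# info = ProgramInfo(
#     startup_program=paddle.static.default_startup_program(),
#     program=program,
#     feed_target_names=feed_target_names,
#     fetch_targets=fetch_targets)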
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .single_distiller import merge, fsp_loss, l2_loss, soft_label_loss, loss
from .single_distiller import merge, fsp, l2, soft_label, loss
from .dml import DML
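# Editor's sketch (not part of the diff): the renamed helpers keep their previous call
# signatures; only the `_loss` suffix is dropped. The variable names below are the ones
# used by the unit tests later in this diff and are assumed to exist after merge().
#
# from paddleslim.dist import merge, fsp, l2, soft_label
# l2_cost = l2('teacher_conv6_bn_output.tmp_2', 'conv2_bn_output.tmp_2')
# soft_cost = soft_label('teacher_conv6_bn_output.tmp_2', 'conv2_bn_output.tmp_2')
# fsp_cost = fsp('teacher_conv1_out.tmp_1', 'teacher_conv6_out.tmp_0',
#                'conv1_out.tmp_0', 'conv2_out.tmp_0')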
......@@ -54,7 +54,8 @@ def merge(teacher_program,
teacher_program = teacher_program.clone(for_test=True)
for teacher_var in teacher_program.list_vars():
skip_rename = False
if teacher_var.name != 'fetch' and (not merge_feed or teacher_var.name != 'feed'):
if teacher_var.name != 'fetch' and (not merge_feed or
teacher_var.name != 'feed'):
if teacher_var.name in data_name_map.keys():
new_name = data_name_map[teacher_var.name]
if new_name == teacher_var.name:
......@@ -72,7 +73,8 @@ def merge(teacher_program,
teacher_var.name, new_name)
for teacher_var in teacher_program.list_vars():
if teacher_var.name != 'fetch' and (not merge_feed or teacher_var.name != 'feed'):
if teacher_var.name != 'fetch' and (not merge_feed or
teacher_var.name != 'feed'):
# student program add var
new_var = student_program.global_block()._clone_variable(
teacher_var, force_persistable=False)
......@@ -111,11 +113,11 @@ def merge(teacher_program,
op._op._set_attr("skip_quant", True)
def fsp_loss(teacher_var1_name,
teacher_var2_name,
student_var1_name,
student_var2_name,
program=None):
def fsp(teacher_var1_name,
teacher_var2_name,
student_var1_name,
student_var2_name,
program=None):
"""Combine variables from student model and teacher model by fsp-loss.
Args:
......@@ -149,7 +151,7 @@ def fsp_loss(teacher_var1_name,
return fsp_loss
def l2_loss(teacher_var_name, student_var_name, program=None):
def l2(teacher_var_name, student_var_name, program=None):
"""Combine variables from student model and teacher model by l2-loss.
Args:
......@@ -170,11 +172,11 @@ def l2_loss(teacher_var_name, student_var_name, program=None):
return l2_loss
def soft_label_loss(teacher_var_name,
student_var_name,
program=None,
teacher_temperature=1.,
student_temperature=1.):
def soft_label(teacher_var_name,
student_var_name,
program=None,
teacher_temperature=1.,
student_temperature=1.):
"""Combine variables from student model and teacher model by soft-label-loss.
Args:
......
......@@ -27,7 +27,7 @@ import paddle.fluid as fluid
from ..common.recover_program import recover_inference_program
from .quanter import _quant_config_default, _parse_configs, pact, get_pact_optimizer
from .quanter import quant_aware, convert
from ..dist import merge, l2_loss, soft_label_loss, fsp_loss
from ..dist import merge, l2, soft_label, fsp
from ..auto_compression.create_compressed_program import build_distill_program
import logging
logging.getLogger().setLevel(logging.INFO)
......@@ -57,7 +57,7 @@ _train_config_default = {
and the teacher node and student node are arranged in pairs.
for example, ["teacher_fc_0.tmp_0", "fc_0.tmp_0", "teacher_batch_norm_24.tmp_4", "batch_norm_24.tmp_4"]
"""
"distill_node_pair": None
"node": None
}
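# Editor's sketch (illustrative): after this rename, the train_config passed to
# quant_aware_with_infermodel lists only student output nodes under "node"; the names
# below are copied from the unit test further down in this diff, and the remaining keys
# are only indicated, not spelled out.
#
# train_config = {
#     "node": [
#         "fc_0.tmp_0", "batch_norm_24.tmp_4", "batch_norm_22.tmp_4",
#         "batch_norm_18.tmp_4", "batch_norm_13.tmp_4", "batch_norm_5.tmp_4"
#     ],
#     # plus num_epoch, save_iter_step, learning_rate, weight_decay, use_pact,
#     # quant_model_ckpt_path, model_path_prefix, teacher_model_path_prefix, ...
# }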
......@@ -91,12 +91,10 @@ def _parse_train_configs(train_config):
"'teacher_model_path_prefix' must both be string"
assert isinstance(configs['model_path_prefix'], str), \
"'model_path_prefix' must both be str"
assert isinstance(configs['distill_node_pair'], list), \
"'distill_node_pair' must both be list"
assert len(configs['distill_node_pair']) > 0, \
"'distill_node_pair' not configured with distillation nodes"
assert len(configs['distill_node_pair']) % 2 == 0, \
"'distill_node_pair' distillation nodes need to be configured in pairs"
assert isinstance(configs['node'], list), \
"'node' must both be list"
assert len(configs['node']) > 0, \
"'node' not configured with distillation nodes"
return train_config
......@@ -143,7 +141,7 @@ def quant_aware_with_infermodel(executor,
train_config(dict): quantization-aware training configs, including num_epoch, save_iter_step, learning_rate,
weight_decay, use_pact, quant_model_ckpt_path,
model_path_prefix, teacher_model_path_prefix,
distill_node_pair(teacher_node_name1, node_name1, teacher_node_name2, teacher_node_name2, ...)
node(node_name1, node_name2, ...)
test_callback(callback function): a callback that takes two params: the compiled test quant program and the checkpoint save filename.
Users can implement their test logic in it.
Returns:
......@@ -261,7 +259,7 @@ def export_quant_infermodel(
train_config(dict): quantization-aware training configs, including num_epoch, save_iter_step, learning_rate,
weight_decay, use_pact, quant_model_ckpt_path,
model_path_prefix, teacher_model_path_prefix,
distill_node_pair(teacher_node_name1, node_name1, teacher_node_name2, teacher_node_name2, ...)
node(node_name1, node_name2, ...)
checkpoint_path(str): the checkpoint path from which to export the quantized inference model.
export_inference_model_path_prefix(str): the path prefix of the exported inference model, i.e. the model's storage directory plus the model name (excluding suffix).
Returns:
......
......@@ -15,7 +15,7 @@ import sys
sys.path.append("../")
import unittest
import paddle
from paddleslim.dist import merge, fsp_loss
from paddleslim.dist import merge, fsp
from layers import conv_bn_layer
from static_case import StaticCase
......@@ -49,9 +49,8 @@ class TestFSPLoss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = fsp_loss('teacher_conv1_out.tmp_1',
'teacher_conv6_out.tmp_0', 'conv1_out.tmp_0',
'conv2_out.tmp_0')
distill_loss = fsp('teacher_conv1_out.tmp_1', 'teacher_conv6_out.tmp_0',
'conv1_out.tmp_0', 'conv2_out.tmp_0')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......
......@@ -16,7 +16,7 @@ sys.path.append("../")
import unittest
import paddle
from static_case import StaticCase
from paddleslim.dist import merge, l2_loss
from paddleslim.dist import merge, l2
from layers import conv_bn_layer
......@@ -48,8 +48,8 @@ class TestL2Loss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = l2_loss('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
distill_loss = l2('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......
......@@ -17,6 +17,7 @@ sys.path.append("../")
sys.path.append(".")
sys.path[0] = os.path.join(os.path.dirname("__file__"), os.path.pardir)
import unittest
import copy
import paddle
from paddleslim.quant import quant_aware, convert
from paddleslim.quant import quant_aware_with_infermodel, export_quant_infermodel
......@@ -145,13 +146,10 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
"./quantaware_with_infermodel_checkpoints/",
"teacher_model_path_prefix": float_infer_model_path_prefix,
"model_path_prefix": float_infer_model_path_prefix,
"distill_node_pair": [
"teacher_fc_0.tmp_0", "fc_0.tmp_0",
"teacher_batch_norm_24.tmp_4", "batch_norm_24.tmp_4",
"teacher_batch_norm_22.tmp_4", "batch_norm_22.tmp_4",
"teacher_batch_norm_18.tmp_4", "batch_norm_18.tmp_4",
"teacher_batch_norm_13.tmp_4", "batch_norm_13.tmp_4",
"teacher_batch_norm_5.tmp_4", "batch_norm_5.tmp_4"
"node": [
"fc_0.tmp_0", "batch_norm_24.tmp_4", "batch_norm_22.tmp_4",
"batch_norm_18.tmp_4", "batch_norm_13.tmp_4",
"batch_norm_5.tmp_4"
]
}
......@@ -184,7 +182,7 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
scope=None,
train_reader=train_loader,
quant_config=quant_config,
train_config=train_config,
train_config=copy.deepcopy(train_config),
test_callback=test_callback)
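# Editor's note (assumption): train_config is deep-copied here and in the export call
# below because quant_aware_with_infermodel / export_quant_infermodel appear to modify
# the dict they receive in place; reusing one dict across both calls without a copy
# could carry state from the first call into the second.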
def test_export_quant_infermodel(exe, place, checkpoint_path,
......@@ -194,7 +192,7 @@ class TestQuantAwareWithInferModelCase1(StaticCase):
place,
scope=None,
quant_config=quant_config,
train_config=train_config,
train_config=copy.deepcopy(train_config),
checkpoint_path=checkpoint_path,
export_inference_model_path_prefix=quant_infermodel_save_path)
......
......@@ -15,7 +15,7 @@ import sys
sys.path.append("../")
import unittest
import paddle
from paddleslim.dist import merge, soft_label_loss
from paddleslim.dist import merge, soft_label
from layers import conv_bn_layer
from static_case import StaticCase
......@@ -48,8 +48,8 @@ class TestSoftLabelLoss(StaticCase):
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
merged_ops.append(op.type)
distill_loss = soft_label_loss('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
distill_loss = soft_label('teacher_conv6_bn_output.tmp_2',
'conv2_bn_output.tmp_2')
loss_ops = []
for block in paddle.static.default_main_program().blocks:
for op in block.ops:
......