diff --git a/README.md b/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0439d84ed9e0de80890f69992343fc55ff6d8d2b 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,61 @@
+
+
+# PaddleSlim
+
+PaddleSlim是PaddlePaddle框架的一个子模块,主要用于压缩图像领域模型。在PaddleSlim中,不仅实现了目前主流的网络剪枝、量化、蒸馏三种压缩策略,还实现了超参数搜索和小模型网络结构搜索功能。在后续版本中,会添加更多的压缩策略,以及完善对NLP领域模型的支持。
+
+## 功能
+
+- 模型剪裁,支持以下三种方式:
+  - 通道均匀模型剪裁(uniform pruning)
+  - 基于敏感度的模型剪裁
+  - 基于进化算法的自动模型剪裁
+
+- 量化
+  - 在线量化训练(training aware)
+  - 离线量化(post training)
+  - 支持对权重全局量化和Channel-Wise量化
+
+- 蒸馏
+
+- 轻量神经网络结构自动搜索(Light-NAS)
+  - 支持基于进化算法的轻量神经网络结构自动搜索
+  - 支持 FLOPs / 硬件延时约束
+  - 支持多平台模型延时评估
+
+
+## 安装
+
+安装PaddleSlim前,请确认已正确安装Paddle 1.6或更新版本。Paddle安装请参考:[Paddle安装教程](https://www.paddlepaddle.org.cn/install/quick)。
+
+
+- 安装develop版本
+
+
+```
+git clone http://gitlab.baidu.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python setup.py install
+```
+
+- 安装官方发布的最新版本
+
+```
+pip install paddleslim -i https://pypi.org/simple
+```
+
+- 安装历史版本
+
+请点击[pypi.org](https://pypi.org/project/paddleslim/#history)查看可安装的历史版本。
+
+## 使用
+
+- [API文档](doc/api_guide.md):API使用介绍,包括[蒸馏]()、[剪裁]()、[量化]()和[模型结构搜索]()。
+- [示例](doc/demo_guide.md):基于mnist和cifar10等简单分类任务的模型压缩示例,您可以通过该部分快速体验和了解PaddleSlim的功能。
+- [实践教程]():经典模型的分析和压缩实验教程。
+- [模型库]():经过压缩的分类、检测、语义分割模型,包括权重文件、网络结构文件和性能数据。
+- [Paddle检测库]():介绍如何在检测库中使用PaddleSlim。
+- [Paddle分割库]():介绍如何在分割库中使用PaddleSlim。
+- [PaddleLite]():介绍如何使用预测库PaddleLite部署PaddleSlim产出的模型。
+
+## 贡献与反馈
diff --git a/demo/distillation/train.py b/demo/distillation/distillation_demo.py
similarity index 74%
rename from demo/distillation/train.py
rename to demo/distillation/distillation_demo.py
index 7f389168440a59f0872d44ab6e62f262e373f6f0..3f47553e541ff86ae0a6f4d86c046a1dee66a03f 100644
--- a/demo/distillation/train.py
+++ b/demo/distillation/distillation_demo.py
@@ -13,8 +13,7 @@ import numpy as np
 import paddle.fluid as fluid
 sys.path.append(sys.path[0] + "/../")
 import models
-import imagenet_reader as reader
-from utility import add_arguments, print_arguments
+from utility import add_arguments, print_arguments, _download, _decompress
 from paddleslim.dist import merge, l2_loss, soft_label_loss, fsp_loss
 
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
@@ -33,12 +32,12 @@ add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay
 add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
 add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
 add_arg('num_epochs', int, 120, "The number of total epochs.")
-add_arg('data', str, "mnist", "Which data to use. 'mnist' or 'imagenet'")
+add_arg('data', str, "cifar10", "Which data to use. 
'cifar10' or 'imagenet'") add_arg('log_period', int, 20, "Log period in batches.") add_arg('model', str, "MobileNet", "Set the network to use.") add_arg('pretrained_model', str, None, "Whether to use pretrained model.") add_arg('teacher_model', str, "ResNet50", "Set the teacher network to use.") -add_arg('teacher_pretrained_model', str, "../pretrain/ResNet50_pretrained", "Whether to use pretrained model.") +add_arg('teacher_pretrained_model', str, "./ResNet50_pretrained", "Whether to use pretrained model.") parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step") # yapf: enable @@ -76,12 +75,12 @@ def create_optimizer(args): def compress(args): - if args.data == "mnist": - import paddle.dataset.mnist as reader - train_reader = reader.train() - val_reader = reader.test() + if args.data == "cifar10": + import paddle.dataset.cifar as reader + train_reader = reader.train10() + val_reader = reader.test10() class_dim = 10 - image_shape = "1,28,28" + image_shape = "3,32,32" elif args.data == "imagenet": import imagenet_reader as reader train_reader = reader.train() @@ -132,7 +131,7 @@ def compress(args): val_reader, batch_size=args.batch_size, drop_last=True) val_program = student_program.clone(for_test=True) - places = fluid.cuda_places() + places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() train_loader.set_sample_list_generator(train_reader, places) valid_loader.set_sample_list_generator(val_reader, place) @@ -140,55 +139,44 @@ def compress(args): # define teacher program teacher_program = fluid.Program() t_startup = fluid.Program() - teacher_scope = fluid.Scope() - with fluid.scope_guard(teacher_scope): - with fluid.program_guard(teacher_program, t_startup): - with fluid.unique_name.guard(): - image = fluid.layers.data( - name='image', shape=image_shape, dtype='float32') - predict = teacher_model.net(image, class_dim=class_dim) - - #print("="*50+"teacher_model_params"+"="*50) - #for v in teacher_program.list_vars(): - # print(v.name, v.shape) - - exe.run(t_startup) - assert args.teacher_pretrained_model and os.path.exists( - args.teacher_pretrained_model - ), "teacher_pretrained_model should be set when teacher_model is not None." - - def if_exist(var): - return os.path.exists( - os.path.join(args.teacher_pretrained_model, var.name) - ) and var.name != 'conv1_weights' and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0' - - fluid.io.load_vars( - exe, - args.teacher_pretrained_model, - main_program=teacher_program, - predicate=if_exist) + with fluid.program_guard(teacher_program, t_startup): + with fluid.unique_name.guard(): + image = fluid.layers.data( + name='image', shape=image_shape, dtype='float32') + predict = teacher_model.net(image, class_dim=class_dim) + + #print("="*50+"teacher_model_params"+"="*50) + #for v in teacher_program.list_vars(): + # print(v.name, v.shape) + + exe.run(t_startup) + _download('http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar', '.') + _decompress('./ResNet50_pretrained.tar') + assert args.teacher_pretrained_model and os.path.exists( + args.teacher_pretrained_model + ), "teacher_pretrained_model should be set when teacher_model is not None." 
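+
+    # 注:fc_0.w_0 和 fc_0.b_0 是teacher网络最后一层全连接的参数。ResNet50预训练权重中
+    # 它们对应ImageNet的1000类输出,与当前任务(如cifar10的10类)的fc形状不一致,
+    # 因此加载teacher预训练参数时跳过这两个变量(见下面的 if_exist)。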
+    def if_exist(var):
+        return os.path.exists(
+            os.path.join(args.teacher_pretrained_model, var.name)
+        ) and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0'
+
+    fluid.io.load_vars(
+        exe,
+        args.teacher_pretrained_model,
+        main_program=teacher_program,
+        predicate=if_exist)
 
     data_name_map = {'image': 'image'}
     main = merge(
         teacher_program,
         student_program,
         data_name_map,
-        place,
-        teacher_scope=teacher_scope)
-
-    #print("="*50+"teacher_vars"+"="*50)
-    #for v in teacher_program.list_vars():
-    #    if '_generated_var' not in v.name and 'fetch' not in v.name and 'feed' not in v.name:
-    #        print(v.name, v.shape)
-    #return
+        place)
 
     with fluid.program_guard(main, s_startup):
         l2_loss_v = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", main)
-        fsp_loss_v = fsp_loss("teacher_res2a_branch2a.conv2d.output.1.tmp_0",
-                              "teacher_res3a_branch2a.conv2d.output.1.tmp_0",
-                              "depthwise_conv2d_1.tmp_0", "conv2d_3.tmp_0",
-                              main)
-        loss = avg_cost + l2_loss_v + fsp_loss_v
+        loss = avg_cost + l2_loss_v
         opt = create_optimizer(args)
         opt.minimize(loss)
         exe.run(s_startup)
@@ -199,17 +187,16 @@ def compress(args):
 
     for epoch_id in range(args.num_epochs):
         for step_id, data in enumerate(train_loader):
-            loss_1, loss_2, loss_3, loss_4 = exe.run(
+            loss_1, loss_2, loss_3 = exe.run(
                 parallel_main,
                 feed=data,
                 fetch_list=[
-                    loss.name, avg_cost.name, l2_loss_v.name, fsp_loss_v.name
+                    loss.name, avg_cost.name, l2_loss_v.name
                 ])
             if step_id % args.log_period == 0:
                 _logger.info(
-                    "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}, fsp loss {:.6f}".
-                    format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0],
-                           loss_4[0]))
+                    "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
+                    format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0]))
         val_acc1s = []
         val_acc5s = []
         for step_id, data in enumerate(valid_loader):
diff --git a/demo/quant/quant_aware/README.md b/demo/quant/quant_aware/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5fae50c5ff752c36863bfa57a9a9f08135b90f00
--- /dev/null
+++ b/demo/quant/quant_aware/README.md
@@ -0,0 +1,77 @@
+# 在线量化示例
+
+本示例介绍如何使用在线量化接口对训练好的分类模型进行量化,可以减少模型的存储空间和显存占用。
+
+## 接口介绍
+
+请参考 量化API文档。
+
+## 分类模型的在线量化流程
+
+### 1. 配置量化参数
+
+```
+quant_config = {
+    'weight_quantize_type': 'abs_max',
+    'activation_quantize_type': 'moving_average_abs_max',
+    'weight_bits': 8,
+    'activation_bits': 8,
+    'not_quant_pattern': ['skip_quant'],
+    'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+    'dtype': 'int8',
+    'window_size': 10000,
+    'moving_rate': 0.9,
+    'quant_weight_only': False
+}
+```
+
+### 2. 对训练和测试program插入可训练量化op
+
+```
+val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True)
+
+compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
+```
+
+### 3. 关闭指定的build策略
+
+```
+build_strategy = fluid.BuildStrategy()
+build_strategy.fuse_all_reduce_ops = False
+build_strategy.sync_batch_norm = False
+exec_strategy = fluid.ExecutionStrategy()
+compiled_train_prog = compiled_train_prog.with_data_parallel(
+        loss_name=avg_cost.name,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+```
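+
+完成上述配置后,可以用下面几行示意代码快速确认 `quant_aware` 转换是否生效(假设沿用上文的 `val_program`,其为 `fluid.Program`;正常情况下应能看到 `fake_quantize_*`、`fake_dequantize_*` 一类的op):
+
+```
+quant_op_types = set()
+for block in val_program.blocks:
+    for op in block.ops:
+        if 'quantize' in op.type:
+            quant_op_types.add(op.type)
+print(quant_op_types)
+```
+
+### 4. freeze program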
+
+```
+float_program, int8_program = convert(val_program,
+                                      place,
+                                      quant_config,
+                                      scope=None,
+                                      save_int8=True)
+```
+
+### 5. 保存预测模型
+
+```
+fluid.io.save_inference_model(
+    dirname=float_path,
+    feeded_var_names=[image.name],
+    target_vars=[out], executor=exe,
+    main_program=float_program,
+    model_filename=float_path + '/model',
+    params_filename=float_path + '/params')
+
+fluid.io.save_inference_model(
+    dirname=int8_path,
+    feeded_var_names=[image.name],
+    target_vars=[out], executor=exe,
+    main_program=int8_program,
+    model_filename=int8_path + '/model',
+    params_filename=int8_path + '/params')
+```
diff --git a/demo/quant/quant_aware/train.py b/demo/quant/quant_aware/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..45b1aa72c062e1c8ceebdfb92e2f80df80246a58
--- /dev/null
+++ b/demo/quant/quant_aware/train.py
@@ -0,0 +1,276 @@
+import os
+import sys
+import logging
+import paddle
+import argparse
+import functools
+import math
+import time
+import numpy as np
+import paddle.fluid as fluid
+sys.path.append(sys.path[0] + "/../../../")
+sys.path.append(sys.path[0] + "/../../")
+from paddleslim.common import get_logger
+from paddleslim.analysis import flops
+from paddleslim.quant import quant_aware, quant_post, convert
+import models
+from utility import add_arguments, print_arguments
+
+quantization_model_save_dir = './quantization_models/'
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',       int,  64 * 4,                 "Minibatch size.")
+add_arg('use_gpu',          bool, True,                   "Whether to use GPU or not.")
+add_arg('model',            str,  "MobileNet",            "The target model.")
+add_arg('pretrained_model', str,  "../pretrained_model/MobileNetV1_pretrained", "Whether to use pretrained model.")
+add_arg('lr',               float, 0.0001,                "The learning rate used to fine-tune pruned model.")
+add_arg('lr_strategy',      str,  "piecewise_decay",      "The learning rate decay strategy.")
+add_arg('l2_decay',         float, 3e-5,                  "The l2_decay parameter.")
+add_arg('momentum_rate',    float, 0.9,                   "The value of momentum_rate.")
+add_arg('num_epochs',       int,  1,                      "The number of total epochs.")
+add_arg('total_images',     int,  1281167,                "The number of total training images.")
+parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step")
+add_arg('config_file',      str,  None,                   "The config file for compression with yaml format.")
+add_arg('data',             str,  "imagenet",             "Which data to use. 'mnist' or 'imagenet'")
+add_arg('log_period',       int,  10,                     "Log period in batches.")
+add_arg('test_period',      int,  10,                     "Test period in epochs.")
+# yapf: enable
+
+model_list = [m for m in dir(models) if "__" not in m]
+
+
+def piecewise_decay(args):
+    step = int(math.ceil(float(args.total_images) / args.batch_size))
+    bd = [step * e for e in args.step_epochs]
+    lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
+    learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=learning_rate,
+        momentum=args.momentum_rate,
+        regularization=fluid.regularizer.L2Decay(args.l2_decay))
+    return optimizer
+
+
+def cosine_decay(args):
+    step = int(math.ceil(float(args.total_images) / args.batch_size))
+    learning_rate = fluid.layers.cosine_decay(
+        learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=learning_rate,
+        momentum=args.momentum_rate,
+        regularization=fluid.regularizer.L2Decay(args.l2_decay))
+    return optimizer
+
+
+def create_optimizer(args):
+    if args.lr_strategy == "piecewise_decay":
+        return piecewise_decay(args)
+    elif args.lr_strategy == "cosine_decay":
+        return cosine_decay(args)
+
+
+def compress(args):
+    ############################################################################################################
+    # 1. quantization configs
+    ############################################################################################################
+    quant_config = {
+        # weight quantize type, default is 'abs_max'
+        'weight_quantize_type': 'abs_max',
+        # activation quantize type, default is 'abs_max'
+        'activation_quantize_type': 'moving_average_abs_max',
+        # weight quantize bit num, default is 8
+        'weight_bits': 8,
+        # activation quantize bit num, default is 8
+        'activation_bits': 8,
+        # ops whose name_scope contains an entry in not_quant_pattern will not be quantized
+        'not_quant_pattern': ['skip_quant'],
+        # ops of the types listed in quantize_op_types will be quantized
+        'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+        # data type after quantization, default is 'int8'
+        'dtype': 'int8',
+        # window size for 'range_abs_max' quantization, default is 10000
+        'window_size': 10000,
+        # the decay coefficient of moving average, default is 0.9
+        'moving_rate': 0.9,
+        # if quant_weight_only is True, only the weights of layers that need quantization are quantized,
+        # and dequantization ops are inserted for those weights
+ 'quant_weight_only': False + } + + train_reader = None + test_reader = None + if args.data == "mnist": + import paddle.dataset.mnist as reader + train_reader = reader.train() + val_reader = reader.test() + class_dim = 10 + image_shape = "1,28,28" + elif args.data == "imagenet": + import imagenet_reader as reader + train_reader = reader.train() + val_reader = reader.val() + class_dim = 1000 + image_shape = "3,224,224" + else: + raise ValueError("{} is not supported.".format(args.data)) + + image_shape = [int(m) for m in image_shape.split(",")] + assert args.model in model_list, "{} is not in lists: {}".format( + args.model, model_list) + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + # model definition + model = models.__dict__[args.model]() + out = model.net(input=image, class_dim=class_dim) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + train_prog = fluid.default_main_program() + val_program = fluid.default_main_program().clone(for_test=True) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + ############################################################################################################ + # 2. quantization transform programs (training aware) + # Make some quantization transforms in the graph before training and testing. + # According to the weight and activation quantization type, the graph will be added + # some fake quantize operators and fake dequantize operators. + ############################################################################################################ + val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True) + compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False) + opt = create_optimizer(args) + opt.minimize(avg_cost) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if args.pretrained_model: + + def if_exist(var): + return os.path.exists(os.path.join(args.pretrained_model, var.name)) + + fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist) + + val_reader = paddle.batch(val_reader, batch_size=args.batch_size) + train_reader = paddle.batch( + train_reader, batch_size=args.batch_size, drop_last=True) + + train_feeder = feeder = fluid.DataFeeder([image, label], place) + val_feeder = feeder = fluid.DataFeeder( + [image, label], place, program=val_program) + + def test(epoch, program): + batch_id = 0 + acc_top1_ns = [] + acc_top5_ns = [] + for data in val_reader(): + start_time = time.time() + acc_top1_n, acc_top5_n = exe.run( + program, + feed=train_feeder.feed(data), + fetch_list=[acc_top1.name, acc_top5.name]) + end_time = time.time() + if batch_id % args.log_period == 0: + _logger.info( + "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}". + format(epoch, batch_id, + np.mean(acc_top1_n), + np.mean(acc_top5_n), end_time - start_time)) + acc_top1_ns.append(np.mean(acc_top1_n)) + acc_top5_ns.append(np.mean(acc_top5_n)) + batch_id += 1 + + _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}". 
+ format(epoch, + np.mean(np.array(acc_top1_ns)), + np.mean(np.array(acc_top5_ns)))) + return np.mean(np.array(acc_top1_ns)) + + def train(epoch, compiled_train_prog): + build_strategy = fluid.BuildStrategy() + build_strategy.memory_optimize = False + build_strategy.enable_inplace = False + build_strategy.fuse_all_reduce_ops = False + build_strategy.sync_batch_norm = False + exec_strategy = fluid.ExecutionStrategy() + compiled_train_prog = compiled_train_prog.with_data_parallel( + loss_name=avg_cost.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + batch_id = 0 + for data in train_reader(): + start_time = time.time() + loss_n, acc_top1_n, acc_top5_n = exe.run( + compiled_train_prog, + feed=train_feeder.feed(data), + fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name]) + end_time = time.time() + loss_n = np.mean(loss_n) + acc_top1_n = np.mean(acc_top1_n) + acc_top5_n = np.mean(acc_top5_n) + if batch_id % args.log_period == 0: + _logger.info( + "epoch[{}]-batch[{}] - loss: {}; acc_top1: {}; acc_top5: {}; time: {}". + format(epoch, batch_id, loss_n, acc_top1_n, acc_top5_n, + end_time - start_time)) + batch_id += 1 + + ############################################################################################################ + # train loop + ############################################################################################################ + for i in range(args.num_epochs): + train(i, compiled_train_prog) + if i % args.test_period == 0: + test(i, val_program) + + ############################################################################################################ + # 3. Freeze the graph after training by adjusting the quantize + # operators' order for the inference. + # The dtype of float_program's weights is float32, but in int8 range. + ############################################################################################################ + float_program, int8_program = convert(val_program, place, quant_config, \ + scope=None, \ + save_int8=True) + + ############################################################################################################ + # 4. 
Save inference model + ############################################################################################################ + model_path = os.path.join(quantization_model_save_dir, args.model, + 'act_' + quant_config['activation_quantize_type'] + '_w_' + quant_config[ + 'weight_quantize_type']) + float_path = os.path.join(model_path, 'float') + int8_path = os.path.join(model_path, 'int8') + if not os.path.isdir(model_path): + os.makedirs(model_path) + + fluid.io.save_inference_model( + dirname=float_path, + feeded_var_names=[image.name], + target_vars=[out], executor=exe, + main_program=float_program, + model_filename=float_path + '/model', + params_filename=float_path + '/params') + + fluid.io.save_inference_model( + dirname=int8_path, + feeded_var_names=[image.name], + target_vars=[out], executor=exe, + main_program=int8_program, + model_filename=int8_path + '/model', + params_filename=int8_path + '/params') + + +def main(): + args = parser.parse_args() + print_arguments(args) + compress(args) + + +if __name__ == '__main__': + main() diff --git a/demo/quant/quant_embedding/README.md b/demo/quant/quant_embedding/README.md index 5667b19a7f27062dc508a68569ae9fb86d178b45..422ef5b6ecbf96a356dfb6e8943d2863f6da5e23 100755 --- a/demo/quant/quant_embedding/README.md +++ b/demo/quant/quant_embedding/README.md @@ -2,22 +2,8 @@ 本示例介绍如何使用Embedding量化的接口 [paddleslim.quant.quant_embedding]() 。``quant_embedding``接口将网络中的Embedding参数从``float32``类型量化到 ``8-bit``整数类型,在几乎不损失模型精度的情况下减少模型的存储空间和显存占用。 -接口如下: -``` -quant_embedding(program, place, config, scope=None) -``` - -参数介绍: -- program(fluid.Program) : 需要量化的program -- scope(fluid.Scope, optional) : 用来获取和写入``Variable``, 如果设置为``None``,则使用``fluid.global_scope()``. -- place(fluid.CPUPlace or fluid.CUDAPlace): 运行program的设备 -- config(dict) : 定义量化的配置。可以配置的参数有: - - ``'params_name'`` (str, required): 需要进行量化的参数名称,此参数必须设置。 - - ``'quantize_type'`` (str, optional): 量化的类型,目前支持的类型是``'abs_max'``, 待支持的类型有 ``'log', 'product_quantization'``。 默认值是``'abs_max'``. - - ``'quantize_bits'``(int, optional): 量化的``bit``数,目前支持的``bit``数为8。默认值是8. - - ``'dtype'``(str, optional): 量化之后的数据类型, 目前支持的是``'int8'``. 默认值是``int8``。 - - ``'threshold'``(float, optional): 量化之前将根据此阈值对需要量化的参数值进行``clip``. 
如果不设置,则跳过``clip``过程直接量化。 +接口介绍请参考 量化API文档。 该接口对program的修改: diff --git a/demo/quant/quant_post/README.md b/demo/quant/quant_post/README.md index 0bab00e7b885a807d7e1d19df8b0053f574e420a..72cd68781d6de71aca19d3b34f1daf187494f371 100755 --- a/demo/quant/quant_post/README.md +++ b/demo/quant/quant_post/README.md @@ -3,32 +3,8 @@ 本示例介绍如何使用离线量化接口``paddleslim.quant.quant_post``来对训练好的分类模型进行离线量化, 该接口无需对模型进行训练就可得到量化模型,减少模型的存储空间和显存占用。 ## 接口介绍 -``` -quant_post(executor, - model_dir, - quantize_model_path, - sample_generator, - model_filename=None, - params_filename=None, - batch_size=16, - batch_nums=None, - scope=None, - algo='KL', - quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"]) -``` -参数介绍: -- executor (fluid.Executor): 执行模型的executor,可以在cpu或者gpu上执行。 -- model_dir(str): 需要量化的模型所在的文件夹。 -- quantize_model_path(str): 保存量化后的模型的路径 -- sample_generator(python generator): 读取数据样本,每次返回一个样本。 -- model_filename(str, optional): 模型文件名,如果需要量化的模型的参数存在一个文件中,则需要设置``model_filename``为模型文件的名称,否则设置为``None``即可。默认值是``None``。 -- params_filename(str): 参数文件名,如果需要量化的模型的参数存在一个文件中,则需要设置``params_filename``为参数文件的名称,否则设置为``None``即可。默认值是``None``。 -- batch_size(int): 每个batch的图片数量。默认值为16 。 -- batch_nums(int, optional): 迭代次数。如果设置为``None``,则会一直运行到``sample_generator`` 迭代结束, 否则,迭代次数为``batch_nums``, 也就是说参与对``Scale``进行校正的样本个数为 ``'batch_nums' * 'batch_size' ``. -- scope(fluid.Scope, optional): 用来获取和写入``Variable``, 如果设置为``None``,则使用``fluid.global_scope()``. 默认值是``None``. -- algo(str): 量化时使用的算法名称,可为``'KL'``或者``'direct'``。该参数仅针对激活值的量化,因为参数值的量化使用的方式为``'channel_wise_abs_max'``. 当``algo`` 设置为``'direct'``时,使用``'abs_max'``计算``Scale``值,当设置为``'KL'``时,则使用``KL``散度的方法来计算``Scale``值。默认值为``'KL'``。 -- quantizable_op_type(list[str]): 需要量化的``op``类型列表。默认值为``["conv2d", "depthwise_conv2d", "mul"]``。 +请参考 量化API文档。 ## 分类模型的离线量化流程 diff --git a/demo/utility.py b/demo/utility.py index dd52f69457c9f8d94920b85dc09b58ff8e605a64..475468f2777ae40427465327ff7b78355cfcbbeb 100644 --- a/demo/utility.py +++ b/demo/utility.py @@ -20,6 +20,12 @@ import distutils.util import os import numpy as np import six +import requests +import shutil +import tqdm +import hashlib +import tarfile +import zipfile import logging import paddle.fluid as fluid import paddle.compat as cpt @@ -30,6 +36,7 @@ logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) +DOWNLOAD_RETRY_LIMIT=3 def print_arguments(args): """Print argparse's arguments. @@ -154,3 +161,122 @@ def load_persistable_nodes(executor, dirname, graph): else: _logger.info("Cannot find the var %s!!!" % (node.name())) fluid.io.load_vars(executor=executor, dirname=dirname, vars=var_list) + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not os.path.exists(path): + os.makedirs(path) + + fname = os.path.split(url)[-1] + fullname = os.path.join(path, fname) + retry_cnt = 0 + + while not (os.path.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" + "Retry limit reached".format(url)) + + _logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + _logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + _logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + _logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + fpath = os.path.split(fname)[0] + fpath_tmp = os.path.join(fpath, 'tmp') + if os.path.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + for f in os.listdir(fpath_tmp): + src_dir = os.path.join(fpath_tmp, f) + dst_dir = os.path.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not os.path.exists(dst): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = os.path.join(src, fp) + dst_fp = os.path.join(dst, fp) + if os.path.isdir(src_fp): + if os.path.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif os.path.isfile(src_fp) and \ + not os.path.isfile(dst_fp): + shutil.move(src_fp, dst_fp) diff --git a/doc/analysis_api.md b/doc/analysis_api.md new file mode 100644 index 0000000000000000000000000000000000000000..61476531b2a0c75cbc1e27d870110ee15c8c31f8 --- /dev/null +++ b/doc/analysis_api.md @@ -0,0 +1,169 @@ +# 模型分析API文档 + +## flops + +>paddleslim.analysis.flops(program, detail=False) [源代码]() + +获得指定网络的每秒浮点运算次数(FLOPS)。 + +**参数:** + +- **program(paddle.fluid.Program):** 待分析的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。 + +- **detail(bool):** 是否返回每个卷积层的FLOPS。默认为False。 + +**返回值:** + +- **flops(float):** 整个网络的FLOPS。 + +- **params2flops(dict):** 每层卷积对应的FLOPS,其中key为卷积层参数名称,value为FLOPS值。 + +**示例:** + +``` +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr 
+from paddleslim.analysis import flops
+
+def conv_bn_layer(input,
+                  num_filters,
+                  filter_size,
+                  name,
+                  stride=1,
+                  groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) // 2,
+        groups=groups,
+        act=None,
+        param_attr=ParamAttr(name=name + "_weights"),
+        bias_attr=False,
+        name=name + "_out")
+    bn_name = name + "_bn"
+    return fluid.layers.batch_norm(
+        input=conv,
+        act=act,
+        name=bn_name + '_output',
+        param_attr=ParamAttr(name=bn_name + '_scale'),
+        bias_attr=ParamAttr(bn_name + '_offset'),
+        moving_mean_name=bn_name + '_mean',
+        moving_variance_name=bn_name + '_variance', )
+
+main_program = fluid.Program()
+startup_program = fluid.Program()
+#   X       X              O          X              O
+# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+#     |            ^ |                    ^
+#     |____________| |____________________|
+#
+# X: prune output channels
+# O: prune input channels
+with fluid.program_guard(main_program, startup_program):
+    input = fluid.data(name="image", shape=[None, 3, 16, 16])
+    conv1 = conv_bn_layer(input, 8, 3, "conv1")
+    conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+    sum1 = conv1 + conv2
+    conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+    conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+    sum2 = conv4 + sum1
+    conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+    conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+print("FLOPs: {}".format(flops(main_program)))
+```
+
+## model_size
+
+>paddleslim.analysis.model_size(program) [源代码]()
+
+获得指定网络的参数数量。
+
+**参数:**
+
+- **program(paddle.fluid.Program):** 待分析的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
+
+**返回值:**
+
+- **model_size(int):** 整个网络的参数数量。
+
+**示例:**
+
+```
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddleslim.analysis import model_size
+
+def conv_layer(input,
+               num_filters,
+               filter_size,
+               name,
+               stride=1,
+               groups=1,
+               act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) // 2,
+        groups=groups,
+        act=None,
+        param_attr=ParamAttr(name=name + "_weights"),
+        bias_attr=False,
+        name=name + "_out")
+    return conv
+
+main_program = fluid.Program()
+startup_program = fluid.Program()
+#   X       X              O          X              O
+# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+#     |            ^ |                    ^
+#     |____________| |____________________|
+#
+# X: prune output channels
+# O: prune input channels
+with fluid.program_guard(main_program, startup_program):
+    input = fluid.data(name="image", shape=[None, 3, 16, 16])
+    conv1 = conv_layer(input, 8, 3, "conv1")
+    conv2 = conv_layer(conv1, 8, 3, "conv2")
+    sum1 = conv1 + conv2
+    conv3 = conv_layer(sum1, 8, 3, "conv3")
+    conv4 = conv_layer(conv3, 8, 3, "conv4")
+    sum2 = conv4 + sum1
+    conv5 = conv_layer(sum2, 8, 3, "conv5")
+    conv6 = conv_layer(conv5, 8, 3, "conv6")
+
+print("model size: {}".format(model_size(main_program)))
+```
+
+## TableLatencyEvaluator
+
+>paddleslim.analysis.TableLatencyEvaluator(table_file, delimiter=",") [源代码]()
+
+基于硬件延时表的模型延时评估器。
+
+**参数:**
+
+- **table_file(str):** 所使用的延时评估表的绝对路径。关于延时评估表的格式请参考:[PaddleSlim硬件延时评估表格式](../paddleslim/analysis/table_latency.md)
+
+- **delimiter(str):** 硬件延时评估表中,操作信息各字段之间所使用的分隔符,默认为英文逗号。
+
+**返回值:**
+
+- **Evaluator:** 硬件延时评估器的实例。
+
+>paddleslim.analysis.TableLatencyEvaluator.latency(graph) [源代码]()
+
+获得指定网络的预估延时。
+
+**参数:**
+
+- **graph(Program):** 待预估的目标网络。
+
+**返回值:**
+
+- **latency:** 目标网络的预估延时。
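+
+**示例:**
+
+以下为使用延时评估器的示意代码。假设 `./latency_table.txt` 是一份符合[硬件延时评估表格式](../paddleslim/analysis/table_latency.md)的表文件(需自行准备),`main_program` 沿用上文flops示例中构建的网络:
+
+```
+from paddleslim.analysis import TableLatencyEvaluator
+
+evaluator = TableLatencyEvaluator("./latency_table.txt", delimiter=",")
+print("latency: {}".format(evaluator.latency(main_program)))
+```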
diff --git a/doc/api_guide.md b/doc/api_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..467a4926d7e1132f565e3f98339e9204bf5dee5d
--- /dev/null
+++ b/doc/api_guide.md
@@ -0,0 +1,20 @@
+
+## [模型分析](./analysis_api.md)
+
+## [卷积通道剪裁](./prune_api.md)
+
+- [通道剪裁](../paddleslim/prune/prune_api.md)
+
+## [蒸馏]()
+
+- [单进程蒸馏](../paddleslim/dist/single_distiller_api_doc.md)
+
+## [量化](../paddleslim/quant/quantization_api_doc.md)
+
+- [量化训练](../paddleslim/quant/quantization_api_doc.md#量化训练API)
+
+- [离线量化](../paddleslim/quant/quantization_api_doc.md#离线量化API)
+
+- [embedding量化](../paddleslim/quant/quantization_api_doc.md#Embedding量化API)
+
+## [小模型结构搜索]()
diff --git a/doc/demo_guide.md b/doc/demo_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..6329a96debbaffa8a832618ced9f37c4326dfd71
--- /dev/null
+++ b/doc/demo_guide.md
@@ -0,0 +1,14 @@
+
+## [蒸馏](../demo/distillation/distillation_demo.py)
+
+蒸馏demo默认使用ResNet50作为teacher网络,MobileNet作为student网络,此外还支持将teacher和student换成[models目录](../demo/models)支持的任意模型。
+
+demo中对teacher模型和student模型的某一层输出添加了l2_loss蒸馏损失函数,使用时也可根据需要选择fsp_loss、soft_label_loss或自定义的loss函数。
+
+训练默认使用cifar10数据集、piecewise_decay学习率衰减策略和momentum优化器,进行120轮蒸馏训练。使用者也可以通过args参数简单地切换为ImageNet数据集、cosine_decay学习率衰减策略等其他训练配置。
+
+## 量化
+
+### [量化训练demo文档](../demo/quant/quant_aware/README.md)
+### [离线量化demo文档](../demo/quant/quant_post/README.md)
+### [Embedding量化demo文档](../demo/quant/quant_embedding/README.md)
diff --git a/doc/prune_api.md b/doc/prune_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb88eb468c2725fd88a12fb8b63fa6575cb8ab5c
--- /dev/null
+++ b/doc/prune_api.md
@@ -0,0 +1,287 @@
+
+# 卷积通道剪裁API文档
+
+## class Pruner
+
+---
+
+>paddleslim.prune.Pruner(criterion="l1_norm")[源代码]()
+
+对卷积网络的通道进行一次剪裁。剪裁一个卷积层的通道,是指剪裁该卷积层输出的通道。卷积层的权重形状为`[output_channel, input_channel, kernel_size, kernel_size]`,通过剪裁该权重的第一维度达到剪裁输出通道数的目的。
+
+**参数:**
+
+- **criterion:** 评估一个卷积层内通道重要性所参考的指标。目前仅支持`l1_norm`。默认为`l1_norm`。
+
+**返回:** 一个Pruner类的实例
+
+**示例代码:**
+
+```
+from paddleslim.prune import Pruner
+pruner = Pruner()
+```
+
+---
+
+>prune(program, scope, params, ratios, place=None, lazy=False, only_graph=False, param_backup=None, param_shape_backup=None)
+
+对目标网络的一组卷积层的权重进行裁剪。
+
+**参数:**
+
+- **program(paddle.fluid.Program):** 要裁剪的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
+
+- **scope(paddle.fluid.Scope):** 要裁剪的权重所在的`scope`,Paddle中用`scope`实例存放模型参数和运行时变量的值。Scope中的参数值会被`inplace`地裁剪。更多介绍请参考[Scope概念介绍]()
+
+- **params(list):** 需要被裁剪的卷积层的参数的名称列表。可以通过以下方式查看模型中所有参数的名称:
+```
+for block in program.blocks:
+    for param in block.all_parameters():
+        print("param: {}; shape: {}".format(param.name, param.shape))
+```
+
+- **ratios(list):** 用于裁剪`params`的剪切率,类型为列表。该列表长度必须与`params`的长度一致。
+
+- **place(paddle.fluid.Place):** 待裁剪参数所在的设备位置,可以是`CUDAPlace`或`CPUPlace`。[Place概念介绍]()
+
+- **lazy(bool):** `lazy`为True时,通过将指定通道的参数置零达到裁剪的目的,参数的`shape`保持不变;`lazy`为False时,直接将要裁剪的通道的参数删除,参数的`shape`会发生变化。
+
+- **only_graph(bool):** 是否只裁剪网络结构。在Paddle中,Program定义了网络结构,Scope存储参数的数值。一个Scope实例可以被多个Program使用,比如定义了训练网络的Program和定义了测试网络的Program是使用同一个Scope实例的。`only_graph`为True时,只对Program中定义的卷积的通道进行剪裁;`only_graph`为False时,Scope中卷积参数的数值也会被剪裁。默认为False。
+
+- **param_backup(bool):** 是否返回对参数值的备份。默认为False。
+
+- **param_shape_backup(bool):** 是否返回对参数`shape`的备份。
+
+**返回:**
+
+- **pruned_program(paddle.fluid.Program):** 被裁剪后的Program。
+
+- **param_backup(dict):** 对参数数值的备份,用于恢复Scope中的参数数值。
+
+- **param_shape_backup(dict):** 对参数形状的备份。
+
+**示例:**
+
+点击[AIStudio](https://aistudio.baidu.com/aistudio/projectDetail/200786)执行以下示例代码。
+```
+
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddleslim.prune import Pruner
+
+def conv_bn_layer(input,
+                  num_filters,
+                  filter_size,
+                  name,
+                  stride=1,
+                  groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) // 2,
+        groups=groups,
+        act=None,
+        param_attr=ParamAttr(name=name + "_weights"),
+        bias_attr=False,
+        name=name + "_out")
+    bn_name = name + "_bn"
+    return fluid.layers.batch_norm(
+        input=conv,
+        act=act,
+        name=bn_name + '_output',
+        param_attr=ParamAttr(name=bn_name + '_scale'),
+        bias_attr=ParamAttr(bn_name + '_offset'),
+        moving_mean_name=bn_name + '_mean',
+        moving_variance_name=bn_name + '_variance', )
+
+main_program = fluid.Program()
+startup_program = fluid.Program()
+#   X       X              O          X              O
+# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+#     |            ^ |                    ^
+#     |____________| |____________________|
+#
+# X: prune output channels
+# O: prune input channels
+with fluid.program_guard(main_program, startup_program):
+    input = fluid.data(name="image", shape=[None, 3, 16, 16])
+    conv1 = conv_bn_layer(input, 8, 3, "conv1")
+    conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+    sum1 = conv1 + conv2
+    conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+    conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+    sum2 = conv4 + sum1
+    conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+    conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+scope = fluid.Scope()
+exe.run(startup_program, scope=scope)
+pruner = Pruner()
+main_program, _, _ = pruner.prune(
+    main_program,
+    scope,
+    params=["conv4_weights"],
+    ratios=[0.5],
+    place=place,
+    lazy=False,
+    only_graph=False,
+    param_backup=None,
+    param_shape_backup=None)
+
+for param in main_program.global_block().all_parameters():
+    if "weights" in param.name:
+        print("param name: {}; param shape: {}".format(param.name, param.shape))
+
+```
+
+
+---
+
+## sensitivity
+
+>paddleslim.prune.sensitivity(program, place, param_names, eval_func, sensitivities_file=None, pruned_ratios=None) [源代码]()
+
+计算网络中每个卷积层的敏感度。每个卷积层的敏感度信息统计方法为:依次剪掉当前卷积层不同比例的输出通道数,在测试集上计算剪裁后的精度损失。得到敏感度信息后,可以通过观察或其它方式确定每层卷积的剪裁率。
+
+**参数:**
+
+- **program(paddle.fluid.Program):** 待评估的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
+
+- **place(paddle.fluid.Place):** 待分析的参数所在的设备位置,可以是`CUDAPlace`或`CPUPlace`。[Place概念介绍]()
+
+- **param_names(list):** 待分析的卷积层的参数的名称列表。可以通过以下方式查看模型中所有参数的名称:
+
+```
+for block in program.blocks:
+    for param in block.all_parameters():
+        print("param: {}; shape: {}".format(param.name, param.shape))
+```
+
+- **eval_func(function):** 用于评估裁剪后模型效果的回调函数。该回调函数接受被裁剪后的`program`为参数,返回当前program的精度,用以计算当前裁剪带来的精度损失。
+
+- **sensitivities_file(str):** 保存敏感度信息的本地文件系统的文件。在敏感度计算过程中,会持续将新计算出的敏感度信息追加到该文件中。重启任务后,文件中已有的敏感度信息不会被重复计算。该文件可以用`pickle`加载。
+
+- **pruned_ratios(list):** 计算卷积层敏感度信息时,依次剪掉的通道数比例。默认为[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]。
+
+**返回:**
+
+- **sensitivities(dict):** 存放敏感度信息的dict,其格式为:
+
+```
+{"weight_0":
+   {"loss": [0.22, 0.33],
+    "pruned_percent": [0.1, 0.2]
+   },
+ "weight_1":
+   {"loss": [0.21, 0.4],
+    "pruned_percent": [0.1, 0.2]
+   }
+}
+```
+
+其中,`weight_0`是卷积层参数的名称,`weight_0`对应的`loss[i]`为将`weight_0`裁掉`pruned_percent[i]`后的精度损失。
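+
+得到的敏感度文件是用`pickle`序列化保存的,可以直接加载查看(示意代码,假设已按下面的示例生成了 `./sensitive.data`):
+
+```
+import pickle
+
+with open("./sensitive.data", "rb") as f:
+    sensitivities = pickle.load(f)
+print(sensitivities)
+```
+
+**示例:**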
+点击[AIStudio](https://aistudio.baidu.com/aistudio/projectdetail/201401)运行以下示例代码。 + +``` +import paddle +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +from paddleslim.prune import sensitivity +import paddle.dataset.mnist as reader + +def conv_bn_layer(input, + num_filters, + filter_size, + name, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + param_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + name=name + "_out") + bn_name = name + "_bn" + return fluid.layers.batch_norm( + input=conv, + act=act, + name=bn_name + '_output', + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', ) + +main_program = fluid.Program() +startup_program = fluid.Program() +# X X O X O +# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6 +# | ^ | ^ +# |____________| |____________________| +# +# X: prune output channels +# O: prune input channels +image_shape = [1,28,28] +with fluid.program_guard(main_program, startup_program): + image = fluid.data(name='image', shape=[None]+image_shape, dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + conv1 = conv_bn_layer(image, 8, 3, "conv1") + conv2 = conv_bn_layer(conv1, 8, 3, "conv2") + sum1 = conv1 + conv2 + conv3 = conv_bn_layer(sum1, 8, 3, "conv3") + conv4 = conv_bn_layer(conv3, 8, 3, "conv4") + sum2 = conv4 + sum1 + conv5 = conv_bn_layer(sum2, 8, 3, "conv5") + conv6 = conv_bn_layer(conv5, 8, 3, "conv6") + out = fluid.layers.fc(conv6, size=10, act="softmax") +# cost = fluid.layers.cross_entropy(input=out, label=label) +# avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) +# acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + +place = fluid.CPUPlace() +exe = fluid.Executor(place) +exe.run(startup_program) + +val_reader = paddle.batch(reader.test(), batch_size=128) +val_feeder = feeder = fluid.DataFeeder( + [image, label], place, program=main_program) + +def eval_func(program): + + acc_top1_ns = [] + for data in val_reader(): + acc_top1_n = exe.run(program, + feed=val_feeder.feed(data), + fetch_list=[acc_top1.name]) + acc_top1_ns.append(np.mean(acc_top1_n)) + return np.mean(acc_top1_ns) +param_names = [] +for param in main_program.global_block().all_parameters(): + if "weights" in param.name: + param_names.append(param.name) +sensitivities = sensitivity(main_program, + place, + param_names, + eval_func, + sensitivities_file="./sensitive.data", + pruned_ratios=[0.1, 0.2, 0.3]) +print(sensitivities) + +``` diff --git a/paddleslim/analysis/__init__.py b/paddleslim/analysis/__init__.py index 2b4889df45a8a58d9d1d36d2d247b12864aac7f5..64eb8dc210a8d1cbaf2541c053dacd7882fc7ca7 100644 --- a/paddleslim/analysis/__init__.py +++ b/paddleslim/analysis/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. 
from .flops import flops from .model_size import model_size +from .latency import LatencyEvaluator, TableLatencyEvaluator -__all__ = ['flops', 'model_size'] +__all__ = ['flops', 'model_size', 'LatencyEvaluator', 'TableLatencyEvaluator'] diff --git a/paddleslim/analysis/flops.py b/paddleslim/analysis/flops.py index 583c8e6ebf1a41f95c5ca8aeab0a1297cd798948..b9f19926a4ce43fcdccf3afd2e00f8d0bbf31d4d 100644 --- a/paddleslim/analysis/flops.py +++ b/paddleslim/analysis/flops.py @@ -18,33 +18,37 @@ from ..core import GraphWrapper __all__ = ["flops"] -def flops(program): +def flops(program, detail=False): """ Get FLOPS of target graph. Args: program(Program): The program used to calculate FLOPS. """ graph = GraphWrapper(program) - return _graph_flops(graph) + return _graph_flops(graph, detail=detail) -def _graph_flops(graph, only_conv=False): +def _graph_flops(graph, only_conv=False, detail=False): assert isinstance(graph, GraphWrapper) flops = 0 + params2flops = {} for op in graph.ops(): if op.type() in ['conv2d', 'depthwise_conv2d']: filter_shape = op.inputs("Filter")[0].shape() input_shape = op.inputs("Input")[0].shape() output_shape = op.outputs("Output")[0].shape() - c_out, c_in, k_h, k_w = filter_shape + _, c_in, _, _ = input_shape + c_out, _, k_h, k_w = filter_shape _, _, h_out, w_out = output_shape groups = op.attr("groups") - kernel_ops = k_h * k_w * (c_in / groups) + kernel_ops = k_h * k_w * (float(c_in) / groups) if len(op.inputs("Bias")) > 0: with_bias = 1 else: with_bias = 0 - flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias) + op_flops = 2 * h_out * w_out * c_out * (kernel_ops + with_bias) + flops += op_flops + params2flops[op.inputs("Filter")[0].name()] = op_flops elif op.type() == 'pool2d' and not only_conv: input_shape = op.inputs("X")[0].shape() output_shape = op.outputs("Out")[0].shape() @@ -65,4 +69,7 @@ def _graph_flops(graph, only_conv=False): input_shape[0] = 1 flops += np.product(input_shape) - return flops + if detail: + return flops, params2flops + else: + return flops diff --git a/paddleslim/analysis/latency.py b/paddleslim/analysis/latency.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9df492186c7473e259c17c38d6d1351f5588d3 --- /dev/null +++ b/paddleslim/analysis/latency.py @@ -0,0 +1,283 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
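+
+# NOTE: 本文件用到 Program、GraphWrapper 和 OpWrapper,这里显式import
+# (假设 paddleslim.core 与 flops.py 的用法一致,导出了 GraphWrapper/OpWrapper):
+from paddle.fluid import Program
+from ..core import GraphWrapper, OpWrapper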
+ +__all__ = ["LatencyEvaluator", "TableLatencyEvaluator"] + + +class LatencyEvaluator(object): + def __init__(self): + pass + + def latency(self, graph): + pass + + def _get_ops_from_graph(self, graph): + assert isinstance(graph, GraphWrapper) + ops = [] + i = 0 + for op in graph.ops(): + if op.type() in ['conv2d', 'depthwise_conv2d']: + tmp = _conv_op_args(op) + elif op.type() in [ + 'elementwise_add', 'elementwise_mul', 'elementwise_max' + ]: + tmp = _eltwise_op_args(op) + elif op.type() in [ + 'relu', 'prelu', 'sigmoid', 'relu6', 'elu', 'brelu', + 'leaky_relu' + ]: + tmp = _activation_op_args(op) + elif op.type() == 'batch_norm': + tmp = _batch_norm_op_args(op) + elif op.type() == 'pool2d': + tmp = _pooling_op_args(op) + elif op.type() == 'batch_norm': + tmp = _batch_norm_op_args(op) + elif op.type() == 'softmax': + tmp = _softmax_op_args(op) + elif op.type() == 'mul': + tmp = _fc_op_args(op) + else: + tmp = None + if tmp: + ops.append(tmp) + return ops + + def _conv_op_args(op): + assert isinstance(op, OpWrapper) + tmp, res = [], [] + # op_name + tmp.append('conv') + # flag_bias + if len(op.inputs('Bias')) == 0: + tmp.append(0) + else: + tmp.append(1) + # flag_relu + tmp.append(int(op.attr('fuse_relu'))) + # batch size + tmp.append(1) + # channels, height, width + in_shapes = op.inputs('Input')[0].shape + tmp = tmp + [int(in_shapes[1]), int(in_shapes[2]), int(in_shapes[3])] + + # output channels + w_shapes = op.inputs('Filter')[0].shape + tmp.append(int(w_shapes[0])) + + # group + tmp.append(int(op.attr('groups'))) + + # kernel size + tmp.append(int(w_shapes[2])) + if w_shapes[2] != w_shapes[3]: + res.append(int(w_shapes[3])) + + # padding + paddings = op.attr('paddings') + tmp.append(int(paddings[0])) + if paddings[0] != paddings[1]: + res.append(int(paddings[0])) + + # strides + strides = op.attr('strides') + tmp.append(int(strides[0])) + if strides[0] != strides[1]: + res.append(int(strides[1])) + + # dilations + dilations = op.attr('dilations') + tmp.append(int(dilations[0])) + if dilations[0] != dilations[1]: + res.append(int(dilations[1])) + tmp = tmp + res + return tmp + + def _batch_norm_op_args(op): + tmp = [] + # op name + tmp.append('batch_norm') + # activation type + if not op.attr('fuse_with_relu'): + tmp.append('None') + else: + tmp.append('relu') + # batch size + tmp.append(1) + # input channels, height, width + in_shapes = op.inputs("X")[0].shape + tmp = tmp + [int(in_shapes[1]), int(in_shapes[2]), int(in_shapes[3])] + return tmp + + def _eltwise_op_args(op): + # op name + tmp = ['eltwise'] + # elementwise type, TODO: add more ops + if op.type() == 'elementwise_mul': + tmp.append(1) + elif op.type() == 'elementwise_add': + tmp.append(2) + else: + tmp.append(3) + # batch size + tmp.append(1) + # input channels, height, width + in_shapes = op.inputs('X')[0].shape + while len(in_shapes) < 4: + in_shapes = in_shapes + (1, ) + + for i in range(1, len(in_shapes)): + tmp.append(int(in_shapes[i])) + return tmp + + def _activation_op_args(op): + tmp = [] + # activation type + tmp.append(op.type()) + # batch size + tmp.append(1) + # input channels, height, width + in_shapes = op.inputs('X')[0].shape + while len(in_shapes) < 4: + in_shapes = in_shapes + (1, ) + + for i in range(1, len(in_shapes)): + tmp.append(int(in_shapes[i])) + return tmp + + def _pooling_op_args(op): + tmp, res = [], [] + # op name + tmp.append('pooling') + # global pooling + tmp.append(int(op.attr('global_pooling'))) + # batch size + tmp.append(1) + # channels, height, width + in_shapes = 
op.inputs('X')[0].shape + tmp = tmp + [int(in_shapes[1]), int(in_shapes[2]), int(in_shapes[3])] + # kernel size + ksize = op.attr('ksize') + tmp.append(int(ksize[0])) + if ksize[0] != ksize[1]: + res.append(int(ksize[1])) + + # padding + paddings = op.attr('paddings') + tmp.append(int(paddings[0])) + if paddings[0] != paddings[1]: + res.append(int(paddings[1])) + + # stride + strides = op.attr('strides') + tmp.append(int(strides[0])) + if strides[0] != strides[1]: + res.append(int(strides[1])) + + # ceil mode + tmp.append(int(op.attr('ceil_mode'))) + + # pool type + pool_type = op.attr('pooling_type') + exclusive = op.attr('exclusive') + if pool_type == 'max' and (not exclusive): + tmp.append(1) + elif pool_type == 'avg' and (not exclusive): + tmp.append(2) + else: + tmp.append(3) + + tmp = tmp + res + return tmp + + def _softmax_op_args(op): + # op name + tmp = ['softmax'] + # axis + tmp.append(op.attr('axis')) + # batch size + tmp.append(1) + # input channels, height, width + in_shapes = op.inputs('X')[0].shape + while len(in_shapes) < 4: + in_shapes = in_shapes + (1, ) + + for i in range(1, len(in_shapes)): + tmp.append(int(in_shapes[i])) + + return tmp + + def _fc_op_args(blocks, op): + # op name + tmp = ['conv'] + # flag bias + tmp.append(0) + # flag relu + tmp.append(0) + # batch size + tmp.append(1) + # input channels, height, width + channels = 1 + in_shape = op.inputs('X')[0].shape + for i in range(1, len(in_shape)): + channels *= in_shape[i] + tmp = tmp + [int(channels), 1, 1] + # output channels + tmp.append(int(op.outputs('Out')[0].shape[1])) + # groups, kernel size, padding, stride, dilation + tmp = tmp + [1, 1, 0, 1, 1] + return tmp + + +class TableLatencyEvaluator(LatencyEvaluator): + def __init__(self, table_file, delimiter=","): + """ + The evaluator used to get graph's latency on some devices and infer engines. + Args: + - table_file(str): The path of file that records the devices latency of operators. + - delimiter(str): The delimiter used in `table_file`. + """ + self._table = self._load_table(table_file) + self._delimiter = delimiter + + def _load_table(self, table_file): + table = {} + with open(table_file) as f: + line = f.readline() + self.infer_engine_name, self.device_name, self.create_time = line.strip( + ).split("\t") + for line in f: + op_str, latency = line.strip().split("\t") + table[op_str] = float(latency) + return table + + def _op_latency(self, op_str): + assert op_str in self._table + return self._table[op_str] + + def latency(self, graph): + """ + Get latency of target graph. + Args: + - graph(GrapWrapper | Program): The graph to be evaluated. + Returns: + latency(float): The latency of given graph on current evaluator. 
+ """ + total_latency = 0 + if isinstance(graph, Program): + graph = GraphWrapper(graph) + assert isinstance(graph, GraphWrapper) + for op in self._get_ops_from_graph(graph): + total_latency += self._op_latency(self._delimiter.join(op)) + return total_latency diff --git a/paddleslim/analysis/table_latency.md b/paddleslim/analysis/table_latency.md new file mode 100644 index 0000000000000000000000000000000000000000..1d95077c65ffc16a03292620909c1aff27d58a43 --- /dev/null +++ b/paddleslim/analysis/table_latency.md @@ -0,0 +1,147 @@ +# 硬件延时评估表 + +硬件延时评估表用于快速评估一个模型在特定硬件环境和推理引擎上的推理速度。 +该文档主要用于定义PaddleSlim支持的硬件延时评估表的格式。 + +## 概述 + +硬件延时评估表中存放着所有可能的操作对应的延时信息,该表中的一个操作包括操作类型和操作参数,比如:操作类型可以是`conv2d`,对应的操作参数有输入特征图的大小、卷积核个数、卷积核大小等。 +给定操作的延时依赖于硬件环境和推理引擎。 + +## 整体格式 + +硬件延时评估表以文件或多行字符串的形式保存。 + +硬件延时评估表第一行保存版本信息,后续每行为一个操作和对应的延时信息。 + +## 版本信息 + +版本信息以英文字符逗号分割,内容依次为硬件环境名称、推理引擎名称和时间戳。 + +- **硬件环境名称:** 用于标识硬件环境,可以包含计算架构类型、版本号等信息。 + +- **推理引擎名称:** 用于标识推理引擎,可以包含推理引擎名称、版本号、优化选项等信息。 + +- **时间戳:** 该评估表的创建时间。 + +## 操作信息 + +操作信息字段之间以逗号分割。操作信息与延迟信息之间以制表符分割。 + +### conv2d + +**格式** + +``` +op_type,flag_bias,flag_relu,n_in,c_in,h_in,w_in,c_out,groups,kernel,padding,stride,dilation\tlatency +``` + +**字段解释** + +- **op_type(str)** - 当前op类型。 +- **flag_bias (int)** - 是否有 bias(0:无,1:有)。 +- **flag_relu (int)** - 是否有 relu(0:无,1:有)。 +- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。 +- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。 +- **h_in (int)** - 输入 Tensor 的特征高度。 +- **w_in (int)** - 输入 Tensor 的特征宽度。 +- **c_out (int)** - 输出 Tensor 的通道 (channel) 数。 +- **groups (int)** - 卷积二维层(Conv2D Layer)的组数。 +- **kernel (int)** - 卷积核大小。 +- **padding (int)** - 填充 (padding) 大小。 +- **stride (int)** - 步长 (stride) 大小。 +- **dilation (int)** - 膨胀 (dilation) 大小。 +- **latency (float)** - 当前op的延时时间 + +### activation + +**格式** + +``` +op_type,n_in,c_in,h_in,w_in\tlatency +``` + +**字段解释** + +- **op_type(str)** - 当前op类型。 +- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。 +- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。 +- **h_in (int)** - 输入 Tensor 的特征高度。 +- **w_in (int)** - 输入 Tensor 的特征宽度。 +- **latency (float)** - 当前op的延时时间 + +### batch_norm + +**格式** + +``` +op_type,active_type,n_in,c_in,h_in,w_in\tlatency +``` + +**字段解释** + +- **op_type(str)** - 当前op类型。 +- **active_type (string)** - 激活函数类型,包含:relu, prelu, sigmoid, relu6, tanh。 +- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。 +- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。 +- **h_in (int)** - 输入 Tensor 的特征高度。 +- **w_in (int)** - 输入 Tensor 的特征宽度。 +- **latency (float)** - 当前op的延时时间 + +### eltwise + +**格式** + +``` +op_type,n_in,c_in,h_in,w_in\tlatency +``` + +**字段解释** + +- **op_type(str)** - 当前op类型。 +- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。 +- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。 +- **h_in (int)** - 输入 Tensor 的特征高度。 +- **w_in (int)** - 输入 Tensor 的特征宽度。 +- **latency (float)** - 当前op的延时时间 + +### pooling + +**格式** + +``` +op_type,flag_global_pooling,n_in,c_in,h_in,w_in,kernel,padding,stride,ceil_mode,pool_type\tlatency +``` + +**字段解释** + +- **op_type(str)** - 当前op类型。 +- **flag_global_pooling (int)** - 是否为全局池化(0:不是,1:是)。 +- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。 +- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。 +- **h_in (int)** - 输入 Tensor 的特征高度。 +- **w_in (int)** - 输入 Tensor 的特征宽度。 +- **kernel (int)** - 卷积核大小。 +- **padding (int)** - 填充 (padding) 大小。 +- **stride (int)** - 步长 (stride) 大小。 +- **ceil_mode (int)** - 是否用 ceil 函数计算输出高度和宽度。0 表示使用 floor 函数,1 表示使用 ceil 函数。 +- **pool_type (int)** - 池化类型,其中 1 表示 pooling_max,2 表示 pooling_average_include_padding,3 表示 pooling_average_exclude_padding。 +- 
+
+### softmax
+
+**格式**
+
+```
+op_type,axis,n_in,c_in,h_in,w_in\tlatency
+```
+
+**字段解释**
+
+- **op_type(str)** - 当前op类型。
+- **axis (int)** - 执行softmax计算的维度索引,应在 [-1, rank-1] 范围内,其中rank是输入变量的秩。
+- **n_in (int)** - 输入 Tensor 的批尺寸 (batch size)。
+- **c_in (int)** - 输入 Tensor 的通道 (channel) 数。
+- **h_in (int)** - 输入 Tensor 的特征高度。
+- **w_in (int)** - 输入 Tensor 的特征宽度。
+- **latency (float)** - 当前op的延时时间
diff --git a/paddleslim/common/sa_controller.py b/paddleslim/common/sa_controller.py
index 176cb8332deb8512528001ef6a7287f07862c130..210721eaaa623b259f4d4bb4affcadb4f2f067d4 100644
--- a/paddleslim/common/sa_controller.py
+++ b/paddleslim/common/sa_controller.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 """The controller used to search hyperparameters or neural architecture"""
 
+import os
 import copy
 import math
 import logging
 import numpy as np
+import json
 from .controller import EvolutionaryController
 from .log_helper import get_logger
@@ -34,15 +36,25 @@ class SAController(EvolutionaryController):
                  init_temperature=1024,
                  max_try_times=None,
                  init_tokens=None,
-                 constrain_func=None):
+                 reward=-1,
+                 max_reward=-1,
+                 iters=0,
+                 best_tokens=None,
+                 constrain_func=None,
+                 checkpoints=None):
         """Initialize.
         Args:
             range_table(list): Range table.
             reduce_rate(float): The decay rate of temperature.
             init_temperature(float): Init temperature.
             max_try_times(int): max try times before get legal tokens.
-            init_tokens(list): The initial tokens.
+            init_tokens(list): The initial tokens. Default: None.
+            reward(float): The reward of the current tokens. Default: -1.
+            max_reward(float): The max reward found so far in the search; in general it is the reward of the best tokens. Default: -1.
+            iters(int): The current iteration of the controller. Default: 0.
+            best_tokens(list): The best tokens found so far in the search, i.e. the tokens that got the max reward. Default: None.
             constrain_func(function): The callback function used to check whether the tokens meet constraint. None means there is no constraint. Default: None.
+            checkpoints(str): The directory used to save checkpoints; if None, no checkpoints will be saved. Default: None.
         """
         super(SAController, self).__init__()
         self._range_table = range_table
@@ -51,12 +63,13 @@ class SAController(EvolutionaryController):
         self._reduce_rate = reduce_rate
         self._init_temperature = init_temperature
         self._max_try_times = max_try_times
-        self._reward = -1
+        self._reward = reward
         self._tokens = init_tokens
         self._constrain_func = constrain_func
-        self._max_reward = -1
-        self._best_tokens = None
-        self._iter = 0
+        self._max_reward = max_reward
+        self._best_tokens = best_tokens
+        self._iter = iters
+        self._checkpoints = checkpoints
 
     def __getstate__(self):
         d = {}
@@ -84,8 +97,11 @@ class SAController(EvolutionaryController):
             self._max_reward = reward
             self._best_tokens = tokens
         _logger.info(
-            "Controller - iter: {}; current_reward: {}; current tokens: {}".
-            format(self._iter, self._reward, self._tokens))
+            "Controller - iter: {}; best_reward: {}, best tokens: {}, current_reward: {}; current tokens: {}".
diff --git a/paddleslim/common/sa_controller.py b/paddleslim/common/sa_controller.py
index 176cb8332deb8512528001ef6a7287f07862c130..210721eaaa623b259f4d4bb4affcadb4f2f067d4 100644
--- a/paddleslim/common/sa_controller.py
+++ b/paddleslim/common/sa_controller.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 """The controller used to search hyperparameters or neural architecture"""

+import os
 import copy
 import math
 import logging
 import numpy as np
+import json

 from .controller import EvolutionaryController
 from .log_helper import get_logger
@@ -34,15 +36,25 @@ class SAController(EvolutionaryController):
                  init_temperature=1024,
                  max_try_times=None,
                  init_tokens=None,
-                 constrain_func=None):
+                 reward=-1,
+                 max_reward=-1,
+                 iters=0,
+                 best_tokens=None,
+                 constrain_func=None,
+                 checkpoints=None):
         """Initialize.
         Args:
             range_table(list): Range table.
             reduce_rate(float): The decay rate of temperature.
             init_temperature(float): Init temperature.
             max_try_times(int): Max number of attempts to generate legal tokens.
-            init_tokens(list): The initial tokens.
+            init_tokens(list): The initial tokens. Default: None.
+            reward(float): The reward of the current tokens. Default: -1.
+            max_reward(float): The maximum reward seen during the SANAS search; in general, the best tokens achieve the maximum reward. Default: -1.
+            iters(int): The current iteration of the SA controller. Default: 0.
+            best_tokens(list): The best tokens found during the SANAS search; in general, the best tokens achieve the maximum reward. Default: None.
             constrain_func(function): The callback function used to check whether the tokens meet constraint. None means there is no constraint. Default: None.
+            checkpoints(str): The directory to save the controller state to; if None, no checkpoint is saved. Default: None.
         """
         super(SAController, self).__init__()
         self._range_table = range_table
@@ -51,12 +63,13 @@ class SAController(EvolutionaryController):
         self._reduce_rate = reduce_rate
         self._init_temperature = init_temperature
         self._max_try_times = max_try_times
-        self._reward = -1
+        self._reward = reward
         self._tokens = init_tokens
         self._constrain_func = constrain_func
-        self._max_reward = -1
-        self._best_tokens = None
-        self._iter = 0
+        self._max_reward = max_reward
+        self._best_tokens = best_tokens
+        self._iter = iters
+        self._checkpoints = checkpoints

     def __getstate__(self):
         d = {}
@@ -84,8 +97,11 @@ class SAController(EvolutionaryController):
             self._max_reward = reward
             self._best_tokens = tokens
         _logger.info(
-            "Controller - iter: {}; current_reward: {}; current tokens: {}".
-            format(self._iter, self._reward, self._tokens))
+            "Controller - iter: {}; best_reward: {}; best tokens: {}; current_reward: {}; current tokens: {}".
+            format(self._iter, self._max_reward, self._best_tokens, reward,
+                   tokens))
+
+        if self._checkpoints is not None:
+            self._save_checkpoint(self._checkpoints)

     def next_tokens(self, control_token=None):
         """
@@ -108,8 +124,19 @@ class SAController(EvolutionaryController):
             index = int(len(self._range_table[0]) * np.random.random())
             new_tokens = tokens[:]
             new_tokens[index] = np.random.randint(
-                self._range_table[0][index],
-                self._range_table[1][index] + 1)
+                self._range_table[0][index], self._range_table[1][index])
         else:
             break
         return new_tokens
+
+    def _save_checkpoint(self, output_dir):
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        file_path = os.path.join(output_dir, 'sanas.checkpoints')
+        scene = dict()
+        for key in self.__dict__:
+            # Skip fields that cannot or should not be serialized to JSON.
+            if key in ['_checkpoints', '_constrain_func']:
+                continue
+            scene[key] = self.__dict__[key]
+        with open(file_path, 'w') as f:
+            json.dump(scene, f)
diff --git a/paddleslim/dist/single_distiller.py b/paddleslim/dist/single_distiller.py
index 70b843c90fec6bdf906045dbac3097f8dfba3ff1..8f5dcaeb14a0f6a7aadd5c99de7bc3c144f21414 100644
--- a/paddleslim/dist/single_distiller.py
+++ b/paddleslim/dist/single_distiller.py
@@ -20,8 +20,7 @@ def merge(teacher_program,
           student_program,
           data_name_map,
           place,
-          teacher_scope=fluid.global_scope(),
-          student_scope=fluid.global_scope(),
+          scope=fluid.global_scope(),
           name_prefix='teacher_'):
     """
     Merge teacher program into student program and add a uniform prefix to the
@@ -33,8 +32,7 @@ def merge(teacher_program,
             and the student var name
         place(fluid.CPUPlace()|fluid.CUDAPlace(N)): This parameter represents
             paddle run on which device.
-        student_scope(Scope): The input student scope
-        teacher_scope(Scope): The input teacher scope
+        scope(Scope): The scope that stores both the teacher and the student variables
         name_prefix(str): Name prefix added for all vars of the teacher program.
     Return(Program): Merged program.
""" @@ -50,9 +48,9 @@ def merge(teacher_program, new_name = name_prefix + teacher_var.name if not skip_rename: # scope var rename - scope_var = teacher_scope.var(teacher_var.name).get_tensor() - renamed_scope_var = teacher_scope.var(new_name).get_tensor() - renamed_scope_var.set(np.array(scope_var), place) + old_var = scope.var(teacher_var.name).get_tensor() + renamed_var = scope.var(new_name).get_tensor() + renamed_var.set(np.array(old_var), place) # program var rename renamed_var = teacher_program.global_block()._rename_var( @@ -60,11 +58,6 @@ def merge(teacher_program, for teacher_var in teacher_program.list_vars(): if teacher_var.name != 'fetch' and teacher_var.name != 'feed': - # student scope add var - student_scope_var = student_scope.var(teacher_var.name).get_tensor() - teacher_scope_var = teacher_scope.var(teacher_var.name).get_tensor() - student_scope_var.set(np.array(teacher_scope_var), place) - # student program add var new_var = student_program.global_block()._clone_variable( teacher_var, force_persistable=False) diff --git a/paddleslim/dist/single_distiller_api_doc.md b/paddleslim/dist/single_distiller_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..e166f922c50066e35413e2af3b3fa6c19533c42c --- /dev/null +++ b/paddleslim/dist/single_distiller_api_doc.md @@ -0,0 +1,206 @@ +# paddleslim.dist API文档 + +## merge(teacher_program, student_program, data_name_map, place, scope=fluid.global_scope(), name_prefix='teacher_') + +该方法将两个fluid program(teacher_program, student_program)融合为一个program,并将融合得到的program返回。在融合的program中,可以为其中合适的teacher特征图和student特征图添加蒸馏损失函数,从而达到用teacher模型的暗知识(Dark Knowledge)指导student模型学习的目的。 + +**参数:** + +- **teacher_program**(Program)-定义了teacher模型的paddle program +- **student_program**(Program)-定义了student模型的paddle program +- **data_name_map**(dict)-teacher输入接口名与student输入接口名的映射,key为teacher的输入名,value为student的输入名。merge函数将会把这两个模型的输入按对应关系合并在一起,保证teacher与student输入数据相同 +- **place**(fluid.CPUPlace()|fluid.CUDAPlace(N))-该参数表示程序运行在何种设备上,这里的N为GPU对应的ID +- **scope**(Scope)-该参数表示teacher variables和student variables所使用的作用域,如果不指定将使用默认的全局作用域。默认值:fluid.global_scope() +- **name_prefix**(str)-为了避免teacher variables和student variables存在同名变量而引起命名冲突,merge函数将统一为teacher variables添加一个名称前缀name_prefix,merge后的program中所有teacher variables都将带有这一名称前缀。默认值:'teacher_' + +**返回:**由student_program和teacher_program merge得到的program + +**使用示例:** + +```python +import paddle.fluid as fluid +import paddleslim.dist as dist +student_program = fluid.Program() +with fluid.program_guard(student_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28]) + conv = fluid.layers.conv2d(x, 32, 1) + out = fluid.layers.conv2d(conv, 64, 3, padding=1) +teacher_program = fluid.Program() +with fluid.program_guard(teacher_program): + y = fluid.layers.data(name='y', shape=[1, 28, 28]) + conv = fluid.layers.conv2d(y, 32, 1) + conv = fluid.layers.conv2d(conv, 32, 3, padding=1) + out = fluid.layers.conv2d(conv, 64, 3, padding=1) +data_name_map = {'y':'x'} +USE_GPU = False +place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace() +main_program = dist.merge(teacher_program, student_program, data_name_map, place) +``` + + + +## fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=fluid.default_main_program()) + +fsp_loss为program内的teacher var和student var添加fsp loss,出自论文[A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning](http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf) + +**参数:** 
+
+- **teacher_var1_name** (str): name of teacher_var1. The corresponding variable is a 4-D feature map tensor of shape `[batch_size, x_channel, height, width]` with data type float32 or float64
+- **teacher_var2_name** (str): name of teacher_var2. The corresponding variable is a 4-D feature map tensor of shape `[batch_size, y_channel, height, width]` with data type float32 or float64. Only y_channel may differ from teacher_var1's x_channel; all other dimensions must match teacher_var1
+- **student_var1_name** (str): name of student_var1. The corresponding variable must have the same shape as teacher_var1: a 4-D feature map tensor of shape `[batch_size, x_channel, height, width]` with data type float32 or float64
+- **student_var2_name** (str): name of student_var2. The corresponding variable must have the same shape as teacher_var2: a 4-D feature map tensor of shape `[batch_size, y_channel, height, width]` with data type float32 or float64. Only y_channel may differ from student_var1's x_channel; all other dimensions must match student_var1
+- **program** (Program): the fluid program used for distillation training. Default: fluid.default_main_program()
+
+**Returns:** the fsp_loss computed from teacher_var1, teacher_var2, student_var1 and student_var2
+
+**Example:**
+
+```python
+import paddle.fluid as fluid
+import paddleslim.dist as dist
+student_program = fluid.Program()
+with fluid.program_guard(student_program):
+    x = fluid.layers.data(name='x', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+teacher_program = fluid.Program()
+with fluid.program_guard(teacher_program):
+    y = fluid.layers.data(name='y', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+    conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+data_name_map = {'y':'x'}
+USE_GPU = False
+place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+main_program = dist.merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(main_program):
+    distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1', 's1.tmp_1', 's2.tmp_1', main_program)
+```
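+
+For reference, the FSP (flow of solution procedure) matrix that pairs the two feature maps can be sketched as follows. This is the definition from the cited paper, written in numpy for illustration, not PaddleSlim's internal implementation:
+
+```python
+import numpy as np
+
+def fsp_matrix(feat1, feat2):
+    """FSP matrix of feature maps shaped [batch, c1, h, w] and [batch, c2, h, w]."""
+    batch, c1, h, w = feat1.shape
+    c2 = feat2.shape[1]
+    a = feat1.reshape(batch, c1, h * w)
+    b = feat2.reshape(batch, c2, h * w)
+    # [batch, c1, c2]: channel-to-channel inner products, averaged over h * w
+    return np.matmul(a, b.transpose(0, 2, 1)) / (h * w)
+```
+
+In the paper, the loss is the mean squared difference between the teacher and the student FSP matrices.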
+
+
+## l2_loss(teacher_var_name, student_var_name, program=fluid.default_main_program())
+
+l2_loss adds an l2 loss between a teacher variable and a student variable in the program
+
+**Parameters:**
+
+- **teacher_var_name** (str): name of teacher_var
+- **student_var_name** (str): name of student_var
+- **program** (Program): the fluid program used for distillation training. Default: fluid.default_main_program()
+
+**Returns:** the l2_loss computed from teacher_var and student_var
+
+**Example:**
+
+```python
+import paddle.fluid as fluid
+import paddleslim.dist as dist
+student_program = fluid.Program()
+with fluid.program_guard(student_program):
+    x = fluid.layers.data(name='x', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+teacher_program = fluid.Program()
+with fluid.program_guard(teacher_program):
+    y = fluid.layers.data(name='y', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+    conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+data_name_map = {'y':'x'}
+USE_GPU = False
+place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+main_program = dist.merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(main_program):
+    distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1', main_program)
+```
+
+
+
+## soft_label_loss(teacher_var_name, student_var_name, program=fluid.default_main_program(), teacher_temperature=1., student_temperature=1.)
+
+soft_label_loss adds a soft label loss between a teacher variable and a student variable in the program. It comes from the paper [Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf)
+
+**Parameters:**
+
+- **teacher_var_name** (str): name of teacher_var
+- **student_var_name** (str): name of student_var
+- **program** (Program): the fluid program used for distillation training. Default: fluid.default_main_program()
+- **teacher_temperature** (float): temperature used to soften teacher_var; the soft target is essentially softmax(teacher_var / teacher_temperature), so a larger temperature yields a smoother distribution
+- **student_temperature** (float): temperature used to soften student_var; a larger temperature yields a smoother distribution
+
+**Returns:** the soft_label_loss computed from teacher_var and student_var
+
+**Example:**
+
+```python
+import paddle.fluid as fluid
+import paddleslim.dist as dist
+student_program = fluid.Program()
+with fluid.program_guard(student_program):
+    x = fluid.layers.data(name='x', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+teacher_program = fluid.Program()
+with fluid.program_guard(teacher_program):
+    y = fluid.layers.data(name='y', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+    conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+data_name_map = {'y':'x'}
+USE_GPU = False
+place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+main_program = dist.merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(main_program):
+    distillation_loss = dist.soft_label_loss('teacher_t2.tmp_1', 's2.tmp_1', main_program, 1., 1.)
+```
+
+
+
+## loss(loss_func, program=fluid.default_main_program(), **kwargs)
+
+loss supports adding a user-defined loss function over any number of teacher and student variable pairs
+
+**Parameters:**
+
+- **loss_func** (python function): the user-defined loss function; its inputs are teacher and student variables, and its output is the custom loss
+- **program** (Program): the fluid program used for distillation training. Default: fluid.default_main_program()
+- **\**kwargs**: mapping from loss_func argument names to the names of the corresponding variables
+
+**Returns:** the loss computed by the user-defined loss function
+
+**Example:**
+
+```python
+import paddle.fluid as fluid
+import paddleslim.dist as dist
+student_program = fluid.Program()
+with fluid.program_guard(student_program):
+    x = fluid.layers.data(name='x', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+teacher_program = fluid.Program()
+with fluid.program_guard(teacher_program):
+    y = fluid.layers.data(name='y', shape=[1, 28, 28])
+    conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+    conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+    out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+data_name_map = {'y':'x'}
+USE_GPU = False
+place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+main_program = dist.merge(teacher_program, student_program, data_name_map, place)
+def adaptation_loss(t_var, s_var):
+    teacher_channel = t_var.shape[1]
+    s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
+    hint_loss = fluid.layers.reduce_mean(fluid.layers.square(s_hint - t_var))
+    return hint_loss
+with fluid.program_guard(main_program):
+    distillation_loss = dist.loss(adaptation_loss, main_program, t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
+```
+
+## Notes
+
+Adding a distillation loss introduces new variables; take care that they do not collide with existing student variable names. Two practices are recommended:
+
+1. Build the distillation loss in the same naming context as student_program, so that automatically named variables (tmp_0, tmp_1, ...) are not defined twice under the same name
+2. Specify a name scope prefix when adding the distillation loss, as sketched after this list; see the Paddle documentation for [fluid.name_scope](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/name_scope_cn.html#name-scope)
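+
+A minimal sketch of the second practice (`main_program` and the variable names are taken from the examples above):
+
+```python
+import paddle.fluid as fluid
+import paddleslim.dist as dist
+
+# Assumes `main_program` was built with dist.merge as in the examples above.
+with fluid.program_guard(main_program):
+    # New ops created for the loss are grouped under the 'distill' scope,
+    # which keeps their names apart from the student variables.
+    with fluid.name_scope('distill'):
+        distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
+                                         main_program)
+```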
diff --git a/paddleslim/nas/sa_nas.py b/paddleslim/nas/sa_nas.py
index d9c5dcf917f734b622bc5515f866d62ee79957de..7eced9c546a57ad49625a707b06e422426c19e92 100644
--- a/paddleslim/nas/sa_nas.py
+++ b/paddleslim/nas/sa_nas.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import socket
 import logging
 import numpy as np
+import json
 import hashlib
 import paddle.fluid as fluid
 from ..core import VarWrapper, OpWrapper, GraphWrapper
@@ -39,6 +41,8 @@ class SANAS(object):
                  reduce_rate=0.85,
                  search_steps=300,
                  key="sa_nas",
+                 save_checkpoint='nas_checkpoint',
+                 load_checkpoint=None,
                  is_server=False):
         """
         Search a group of ratios used to prune program.
@@ -51,6 +55,8 @@ class SANAS(object):
             reduce_rate(float): The decay rate used in simulated annealing search strategy.
             search_steps(int): The steps of searching.
             key(str): Identity used in communication between controller server and clients.
+            save_checkpoint(str|None): The directory to save checkpoints to; if set to None, no checkpoint is saved. Default: 'nas_checkpoint'.
+            load_checkpoint(str|None): The directory to load a checkpoint from; if set to None, no checkpoint is loaded. Default: None.
-            is_server(bool): Whether current host is controller server. Default: True.
+            is_server(bool): Whether current host is controller server. Default: False.
         """
         if not is_server:
@@ -75,13 +81,39 @@ class SANAS(object):
         range_table = self._search_space.range_table()
         range_table = (len(range_table) * [0], range_table)
         _logger.info("range table: {}".format(range_table))
+
+        if load_checkpoint is not None:
+            assert os.path.exists(
+                load_checkpoint
+            ), 'The checkpoint directory does not exist. Please check `load_checkpoint`.'
+            checkpoint_path = os.path.join(load_checkpoint,
+                                           'sanas.checkpoints')
+            with open(checkpoint_path, 'r') as f:
+                scene = json.load(f)
+            preinit_tokens = scene['_tokens']
+            prereward = scene['_reward']
+            premax_reward = scene['_max_reward']
+            prebest_tokens = scene['_best_tokens']
+            preiter = scene['_iter']
+        else:
+            preinit_tokens = init_tokens
+            prereward = -1
+            premax_reward = -1
+            prebest_tokens = None
+            preiter = 0
+
         controller = SAController(
             range_table,
             self._reduce_rate,
             self._init_temperature,
             max_try_times=None,
-            init_tokens=init_tokens,
-            constrain_func=None)
+            init_tokens=preinit_tokens,
+            reward=prereward,
+            max_reward=premax_reward,
+            iters=preiter,
+            best_tokens=prebest_tokens,
+            constrain_func=None,
+            checkpoints=save_checkpoint)

         max_client_num = 100
         self._controller_server = ControllerServer(
@@ -96,13 +128,16 @@ class SANAS(object):
         self._controller_client = ControllerClient(
             server_ip, server_port, key=self._key)

-        self._iter = 0
+        if is_server and load_checkpoint is not None:
+            self._iter = scene['_iter']
+        else:
+            self._iter = 0

     def _get_host_ip(self):
         return socket.gethostbyname(socket.gethostname())

     def tokens2arch(self, tokens):
-        return self._search_space.token2arch(self.tokens)
+        return self._search_space.token2arch(tokens)

     def next_archs(self):
         """
diff --git a/paddleslim/prune/pruner.py b/paddleslim/prune/pruner.py
index e2b6a7e1d28078abef97c5fa53b215b098f18cca..95f6774ce5a36b8a6aa05fd6f989f0cb23f2339c 100644
--- a/paddleslim/prune/pruner.py
+++ b/paddleslim/prune/pruner.py
@@ -41,8 +41,8 @@ class Pruner():
               place=None,
               lazy=False,
               only_graph=False,
-              param_backup=None,
-              param_shape_backup=None):
+              param_backup=False,
+              param_shape_backup=False):
         """
         Pruning the given parameters.
         Args:
@@ -55,14 +55,18 @@ class Pruner():
                        False means cutting down the pruned elements. Default: False.
             only_graph(bool): True means only modifying the graph.
                        False means modifying graph and variables in scope. Default: False.
-            param_backup(dict): A dict to backup the values of parameters. Default: None.
-            param_shape_backup(dict): A dict to backup the shapes of parameters. Default: None.
+            param_backup(bool): Whether to return a dict that backs up the original values of parameters. Default: False.
+            param_shape_backup(bool): Whether to return a dict that backs up the original shapes of parameters. Default: False.
         Returns:
             Program: The pruned program.
+            param_backup: A dict that backs up the original values of parameters.
+            param_shape_backup: A dict that backs up the original shapes of parameters.
         """
         self.pruned_list = []
         graph = GraphWrapper(program.clone())
+        param_backup = {} if param_backup else None
+        param_shape_backup = {} if param_shape_backup else None
         self._prune_parameters(
             graph,
             scope,
@@ -77,7 +81,7 @@ class Pruner():
             if op.type() == 'depthwise_conv2d' or op.type(
             ) == 'depthwise_conv2d_grad':
                 op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
-        return graph.program
+        return graph.program, param_backup, param_shape_backup

     def _prune_filters_by_ratio(self,
                                 scope,
@@ -130,8 +134,16 @@ class Pruner():
                     param.name() not in param_backup):
                 param_backup[param.name()] = copy.deepcopy(
                     np.array(param_t))
-            pruned_param = self._prune_tensor(
-                np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
+            try:
+                pruned_param = self._prune_tensor(
+                    np.array(param_t),
+                    pruned_idx,
+                    pruned_axis=0,
+                    lazy=lazy)
+            except IndexError as e:
+                _logger.error("Error while pruning {}: [{}]".format(
+                    param.name(), e))
+                # Re-raise: `pruned_param` is undefined when pruning fails.
+                raise
             param_t.set(pruned_param, place)
             ori_shape = param.shape()
             if param_shape_backup is not None and (
@@ -171,7 +183,6 @@ class Pruner():
         """
         if params[0].name() in self.pruned_list[pruned_axis]:
             return
-
         if only_graph:
             pruned_num = len(pruned_idx)
             for param in params:
@@ -210,40 +221,55 @@ class Pruner():
                 ), ori_shape, new_shape))
             self.pruned_list[pruned_axis].append(param.name())

-    def _forward_search_related_op(self, graph, param):
+    def _forward_search_related_op(self, graph, node):
         """
         Forward search operators that will be affected by pruning of param.
         Args:
             graph(GraphWrapper): The graph to be searched.
-            param(VarWrapper): The current pruned parameter.
+            node(VarWrapper|OpWrapper): The current pruned parameter or operator.
         Returns:
             list: A list of operators.
         """
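+        # Breadth-first walk starting from the ops that consume `node` (or
+        # from the successors of `node` when it is an op). The walk does not
+        # continue past conv2d/deformable_conv/mul/concat ops, whose handling
+        # is decided by the caller.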
""" - assert isinstance(param, VarWrapper) visited = {} for op in graph.ops(): visited[op.idx()] = False stack = [] - for op in graph.ops(): - if (not op.is_bwd_op()) and (param in op.all_inputs()): - stack.append(op) visit_path = [] + if isinstance(node, VarWrapper): + for op in graph.ops(): + if (not op.is_bwd_op()) and (node in op.all_inputs()): + next_ops = self._get_next_unvisited_op(graph, visited, op) + # visit_path.append(op) + visited[op.idx()] = True + for next_op in next_ops: + if visited[next_op.idx()] == False: + stack.append(next_op) + visit_path.append(next_op) + visited[next_op.idx()] = True + elif isinstance(node, OpWrapper): + next_ops = self._get_next_unvisited_op(graph, visited, node) + for next_op in next_ops: + if visited[next_op.idx()] == False: + stack.append(next_op) + visit_path.append(next_op) + visited[next_op.idx()] = True while len(stack) > 0: - top_op = stack[len(stack) - 1] - if visited[top_op.idx()] == False: - visit_path.append(top_op) - visited[top_op.idx()] = True + #top_op = stack[len(stack) - 1] + top_op = stack.pop(0) next_ops = None - if top_op.type() == "conv2d" and param not in top_op.all_inputs(): + if top_op.type() in ["conv2d", "deformable_conv"]: next_ops = None - elif top_op.type() == "mul": + elif top_op.type() in ["mul", "concat"]: next_ops = None else: next_ops = self._get_next_unvisited_op(graph, visited, top_op) - if next_ops == None: - stack.pop() - else: - stack += next_ops + if next_ops != None: + for op in next_ops: + if visited[op.idx()] == False: + stack.append(op) + visit_path.append(op) + visited[op.idx()] = True + return visit_path def _get_next_unvisited_op(self, graph, visited, top_op): @@ -261,7 +287,7 @@ class Pruner(): for op in graph.next_ops(top_op): if (visited[op.idx()] == False) and (not op.is_bwd_op()): next_ops.append(op) - return next_ops if len(next_ops) > 0 else None + return next_ops def _get_accumulator(self, graph, param): """ @@ -317,7 +343,8 @@ class Pruner(): if param.name() in self.pruned_list[0]: return related_ops = self._forward_search_related_op(graph, param) - + for op in related_ops: + _logger.debug("relate op: {};".format(op)) if ratio is None: assert pruned_idxs is not None self._prune_parameter_by_idx( @@ -339,17 +366,20 @@ class Pruner(): only_graph=only_graph, param_backup=param_backup, param_shape_backup=param_shape_backup) - corrected_idxs = pruned_idxs[:] + self._prune_ops(related_ops, pruned_idxs, graph, scope, place, lazy, + only_graph, param_backup, param_shape_backup) - for idx, op in enumerate(related_ops): - if op.type() == "conv2d" and (param not in op.all_inputs()): + def _prune_ops(self, ops, pruned_idxs, graph, scope, place, lazy, + only_graph, param_backup, param_shape_backup): + for idx, op in enumerate(ops): + if op.type() in ["conv2d", "deformable_conv"]: for in_var in op.all_inputs(): if graph.is_parameter(in_var): conv_param = in_var self._prune_parameter_by_idx( scope, [conv_param] + self._get_accumulator( graph, conv_param), - corrected_idxs, + pruned_idxs, pruned_axis=1, place=place, lazy=lazy, @@ -363,7 +393,7 @@ class Pruner(): self._prune_parameter_by_idx( scope, [conv_param] + self._get_accumulator( graph, conv_param), - corrected_idxs, + pruned_idxs, pruned_axis=0, place=place, lazy=lazy, @@ -397,7 +427,7 @@ class Pruner(): idx = [] feature_map_size = fc_input.shape()[2] * fc_input.shape()[3] range_idx = np.array(range(feature_map_size)) - for i in corrected_idxs: + for i in pruned_idxs: idx += list(range_idx + i * feature_map_size) corrected_idxs = idx 
                 self._prune_parameter_by_idx(
@@ -412,23 +442,37 @@ class Pruner():
             elif op.type() == "concat":
                 concat_inputs = op.all_inputs()
-                last_op = related_ops[idx - 1]
-                for out_var in last_op.all_outputs():
-                    if out_var in concat_inputs:
-                        concat_idx = concat_inputs.index(out_var)
+                concat_idx = None
+                for last_op in reversed(ops):
+                    for out_var in last_op.all_outputs():
+                        if out_var in concat_inputs:
+                            concat_idx = concat_inputs.index(out_var)
+                            break
+                    if concat_idx is not None:
+                        break
                 offset = 0
                 for ci in range(concat_idx):
                     offset += concat_inputs[ci].shape()[1]
                 corrected_idxs = [x + offset for x in pruned_idxs]
+                related_ops = self._forward_search_related_op(graph, op)
+
+                for concat_op in related_ops:
+                    _logger.debug("concat related op: {};".format(concat_op))
+
+                self._prune_ops(related_ops, corrected_idxs, graph, scope,
+                                place, lazy, only_graph, param_backup,
+                                param_shape_backup)
             elif op.type() == "batch_norm":
                 bn_inputs = op.all_inputs()
-                mean = bn_inputs[2]
+                beta = bn_inputs[0]
+                mean = bn_inputs[1]
+                alpha = bn_inputs[2]
                 variance = bn_inputs[3]
-                alpha = bn_inputs[0]
-                beta = bn_inputs[1]
                 self._prune_parameter_by_idx(
                     scope, [mean] + self._get_accumulator(graph, mean),
-                    corrected_idxs,
+                    pruned_idxs,
                     pruned_axis=0,
                     place=place,
                     lazy=lazy,
@@ -437,7 +481,7 @@ class Pruner():
                 self._prune_parameter_by_idx(
                     scope, [variance] + self._get_accumulator(graph, variance),
-                    corrected_idxs,
+                    pruned_idxs,
                     pruned_axis=0,
                     place=place,
                     lazy=lazy,
@@ -446,7 +490,7 @@ class Pruner():
                 self._prune_parameter_by_idx(
                     scope, [alpha] + self._get_accumulator(graph, alpha),
-                    corrected_idxs,
+                    pruned_idxs,
                     pruned_axis=0,
                     place=place,
                     lazy=lazy,
@@ -455,7 +499,7 @@ class Pruner():
                 self._prune_parameter_by_idx(
                     scope, [beta] + self._get_accumulator(graph, beta),
-                    corrected_idxs,
+                    pruned_idxs,
                     pruned_axis=0,
                     place=place,
                     lazy=lazy,
@@ -491,6 +535,10 @@ class Pruner():
         self.pruned_list = [[], []]
         for param, ratio in zip(params, ratios):
             assert isinstance(param, str) or isinstance(param, unicode)
+            if param in self.pruned_list[0]:
+                _logger.info("Skip {}".format(param))
+                continue
+            _logger.info("pruning param: {}".format(param))
             param = graph.var(param)
             self._forward_pruning_ralated_params(
                 graph,
@@ -504,9 +552,10 @@ class Pruner():
                 param_shape_backup=param_shape_backup)
             ops = param.outputs()
             for op in ops:
-                if op.type() == 'conv2d':
+                if op.type() in ['conv2d', 'deformable_conv']:
                     brother_ops = self._search_brother_ops(graph, op)
                     for broher in brother_ops:
+                        _logger.debug("pruning brother: {}".format(broher))
                         for p in graph.get_param_by_op(broher):
                             self._forward_pruning_ralated_params(
                                 graph,
@@ -534,8 +583,11 @@ class Pruner():
         stack = []
         brothers = []
         for op in graph.next_ops(op_node):
-            if ("conv2d" not in op.type()) and (op.type() != 'fc') and (
-                    not op.is_bwd_op()) and (not op.is_opt_op()):
+            if ("conv2d" not in op.type()) and (
+                    "concat" not in op.type()) and (
+                    "deformable_conv" not in op.type()) and (
+                    op.type() != 'fc') and (
+                    not op.is_bwd_op()) and (not op.is_opt_op()):
                 stack.append(op)
                 visited.append(op.idx())
         while len(stack) > 0:
@@ -546,6 +598,7 @@ class Pruner():
                 _logger.debug("----------go back from {} to {}----------".
                               format(top_op, parent))
                 if (('conv2d' in parent.type()) or
+                        ("deformable_conv" in parent.type()) or
                         (parent.type() == 'fc')):
                     brothers.append(parent)
                 else:
                     stack.append(parent)
                 visited.append(parent.idx())

             for child in graph.next_ops(top_op):
-                if ('conv2d' not in child.type()
-                    ) and (child.type() != 'fc') and (
-                        child.idx() not in visited) and (
-                        not child.is_bwd_op()) and (not child.is_opt_op()):
+                if ('conv2d' not in child.type()) and (
+                        "concat" not in child.type()) and (
+                        'deformable_conv' not in child.type()) and (
+                        child.type() != 'fc') and (
+                        child.idx() not in visited) and (
+                        not child.is_bwd_op()) and (
+                        not child.is_opt_op()):
                     stack.append(child)
                     visited.append(child.idx())
         _logger.debug("brothers: {}".format(brothers))
diff --git a/paddleslim/prune/sensitive.py b/paddleslim/prune/sensitive.py
index 8dd2f88b7de9eba62df447c78ef629a32111cd09..3341a4a2e847bfb62b00a5caf4807df3b960ea68 100644
--- a/paddleslim/prune/sensitive.py
+++ b/paddleslim/prune/sensitive.py
@@ -33,12 +33,14 @@ def sensitivity(program,
                 param_names,
                 eval_func,
                 sensitivities_file=None,
-                step_size=0.2,
-                max_pruned_times=None):
+                pruned_ratios=None):
     scope = fluid.global_scope()
     graph = GraphWrapper(program)
     sensitivities = _load_sensitivities(sensitivities_file)
+
+    if pruned_ratios is None:
+        pruned_ratios = np.arange(0.1, 1, step=0.1)
+
     for name in param_names:
         if name not in sensitivities:
             size = graph.var(name).shape()[0]
@@ -49,25 +51,17 @@ def sensitivity(program,
             }
     baseline = None
     for name in sensitivities:
-        ratio = step_size
-        pruned_times = 0
-        while ratio < 1:
-            if max_pruned_times is not None and pruned_times >= max_pruned_times:
-                break
-            ratio = round(ratio, 2)
+        for ratio in pruned_ratios:
             if ratio in sensitivities[name]['pruned_percent']:
                 _logger.debug('{}, {} has computed.'.format(name, ratio))
-                ratio += step_size
-                pruned_times += 1
                 continue
             if baseline is None:
                 baseline = eval_func(graph.program)
-            param_backup = {}
             pruner = Pruner()
             _logger.info("sensitive - param: {}; ratios: {}".format(name,
                                                                     ratio))
-            pruned_program = pruner.prune(
+            pruned_program, param_backup, _ = pruner.prune(
                 program=graph.program,
                 scope=scope,
                 params=[name],
@@ -75,21 +69,21 @@ def sensitivity(program,
                 place=place,
                 lazy=True,
                 only_graph=False,
-                param_backup=param_backup)
+                param_backup=True)
             pruned_metric = eval_func(pruned_program)
             loss = (baseline - pruned_metric) / baseline
             _logger.info("pruned param: {}; {}; loss={}".format(name, ratio,
                                                                 loss))
+
             sensitivities[name]['pruned_percent'].append(ratio)
             sensitivities[name]['loss'].append(loss)
+
             _save_sensitivities(sensitivities, sensitivities_file)

             # restore pruned parameters
             for param_name in param_backup.keys():
                 param_t = scope.find_var(param_name).get_tensor()
                 param_t.set(param_backup[param_name], place)
-            ratio += step_size
-            pruned_times += 1
     return sensitivities
@@ -121,7 +115,7 @@ def flops_sensitivity(program,
     baseline = None
     for name in sensitivities:
-        pruned_program = pruner.prune(
+        pruned_program, _, _ = pruner.prune(
             program=graph.program,
             scope=None,
             params=[name],
diff --git a/paddleslim/quant/quantization_api_doc.md b/paddleslim/quant/quantization_api_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..9607e60a8191d63e269ac98eae8bf77d2579dabe
--- /dev/null
+++ b/paddleslim/quant/quantization_api_doc.md
@@ -0,0 +1,253 @@
+# paddleslim.quant API Documentation
+
+## Quantization-aware training API
+
+### Quantization configuration
+```
+quant_config_default = {
+    'weight_quantize_type': 'abs_max',
+    'activation_quantize_type': 'abs_max',
+    'weight_bits': 8,
+    'activation_bits': 8,
+    # ops whose name_scope is in the not_quant_pattern list will not be quantized
+    'not_quant_pattern': ['skip_quant'],
+    # ops whose type is in quantize_op_types will be quantized
+    'quantize_op_types':
+    ['conv2d', 'depthwise_conv2d', 'mul', 'elementwise_add', 'pool2d'],
+    # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
+    'dtype': 'int8',
+    # window size for 'range_abs_max' quantization. default is 10000
+    'window_size': 10000,
+    # the decay coefficient of moving average, default is 0.9
+    'moving_rate': 0.9,
+    # if quant_weight_only is True, only the parameters of the layers to be
+    # quantized are quantized, and activations are left unquantized
+    'quant_weight_only': False
+}
+```
+This dictionary holds the default configuration for quantization-aware training.
+
+**Parameters:**
+
+- **weight_quantize_type (str)** - quantization type for weights; one of ``'abs_max'``, ``'channel_wise_abs_max'``, ``'range_abs_max'``, ``'moving_average_abs_max'``. Default: ``'abs_max'``.
+- **activation_quantize_type (str)** - quantization type for activations; one of ``'abs_max'``, ``'range_abs_max'``, ``'moving_average_abs_max'``. Default: ``'abs_max'``.
+- **weight_bits (int)** - number of bits for weight quantization. Default: 8; 8 is recommended.
+- **activation_bits (int)** - number of bits for activation quantization. Default: 8; 8 is recommended.
+- **not_quant_pattern (str or list[str])** - any ``op`` whose ``name_scope`` contains one of the ``not_quant_pattern`` strings is left unquantized; see ``fluid.name_scope()`` for how to set the scope.
+- **quantize_op_types (list[str])** - types of ``op`` to quantize; currently ``'conv2d', 'depthwise_conv2d', 'mul'`` are supported.
+- **dtype (str)** - parameter data type after quantization. Default: ``int8``; currently only ``int8`` is supported.
+- **window_size (int)** - window size for ``'range_abs_max'`` quantization. Default: 10000.
+- **moving_rate (float)** - decay coefficient for ``'moving_average_abs_max'`` quantization. Default: 0.9.
+- **quant_weight_only (bool)** - whether to quantize only the weights; if set to ``True``, activations are left unquantized. Default: ``False``. Setting it to ``True`` is not supported yet. When ``True``, only weights are quantized, which does not save memory or speed up inference; it only reduces bandwidth.
+
+
+### paddleslim.quant.quant_aware(program, place, config, scope=None, for_test=False)
+
+quant_aware inserts quantization and dequantization ``op``s into ``program`` for quantization-aware training.
+
+
+**Parameters:**
+
+* **program (fluid.Program)** - the training or test ``program``.
+* **place (fluid.CPUPlace or fluid.CUDAPlace)** - the device on which the ``Executor`` runs.
+* **config (dict)** - the quantization configuration.
+* **scope (fluid.Scope, optional)** - the ``scope`` that stores the ``Variable``s; it must be the ``scope`` used by ``program``, which is normally ``fluid.global_scope()``. When set to ``None``, ``fluid.global_scope()`` is used. Default: ``None``.
+* **for_test (bool)** - set ``for_test`` to ``True`` if ``program`` is a test ``program``, and to ``False`` otherwise.
+
+**Returns**
+
+A ``program`` with quantization and dequantization ``operator``s inserted.
+
+**Return type**
+
+* When ``for_test=False``, the return type is ``fluid.CompiledProgram``. **Note: this return value cannot be used to save parameters.**
+* When ``for_test=True``, the return type is ``fluid.Program``.
+
+**Notes**
+
+* This function changes the structure of ``program`` and may add ``persistable`` variables, so make sure the model parameters you load match the corresponding ``program``.
+* Internally this function goes through the conversion ``fluid.Program`` -> ``fluid.framework.IrGraph`` -> ``fluid.Program``. ``fluid.framework.IrGraph`` has no notion of ``Parameter``; a ``Variable`` is only either ``persistable`` or ``not persistable``. Therefore, use ``fluid.io.save_persistables`` and ``fluid.io.load_persistables`` to save and load parameters.
+* Because this function inserts ops according to the structure of ``program`` and the quantization configuration, some ``Paddle`` strategies that speed up training via ``fuse op`` cannot be used. The following options are known to require ``False`` when quantization is used: ``fuse_all_reduce_ops, sync_batch_norm``.
+* Any ``Variable`` in ``program`` that is not connected to any op will be optimized away during quantization.
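+
+A minimal sketch of the save/load note above (the directory name is hypothetical; ``exe`` and ``quant_eval_program`` are assumed to come from a workflow like the example under ``convert`` below):
+
+```python
+import paddle.fluid as fluid
+
+# Save all persistable variables (weights plus quantization state).
+fluid.io.save_persistables(exe, './quant_checkpoint',
+                           main_program=quant_eval_program)
+# ...later, restore them into the same program structure.
+fluid.io.load_persistables(exe, './quant_checkpoint',
+                           main_program=quant_eval_program)
+```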
+
+
+### paddleslim.quant.convert(program, place, config, scope=None, save_int8=False)
+
+
+convert transforms a trained quantization ``program`` into a ``program`` that can be saved as an ``inference model``.
+
+**Parameters:**
+- **program (fluid.Program)** - the test ``program``.
+- **place (fluid.CPUPlace or fluid.CUDAPlace)** - the device on which the ``Executor`` runs.
+- **config (dict)** - the quantization configuration.
+- **scope (fluid.Scope)** - the ``scope`` that stores the ``Variable``s; it must be the ``scope`` used by ``program``, which is normally ``fluid.global_scope()``. When set to ``None``, ``fluid.global_scope()`` is used. Default: ``None``.
+- **save_int8 (bool)** - whether to additionally return a ``program`` whose parameters are ``int8``; this is currently only useful for checking the model size. Default: ``False``.
+
+**Returns**
+
+- **program (fluid.Program)** - the frozen program, which can be saved as an inference model; its parameters are ``float32``, but their values fit in the int8 range.
+- **int8_program (fluid.Program)** - the frozen program, which can be saved as an inference model; its parameters are ``int8``. Not returned when ``save_int8`` is ``False``.
+
+**Notes**
+
+Because this function deletes and modifies ``op``s and ``Variable``s, it must only be called after training has finished. To convert an intermediate checkpoint, load the corresponding parameters first and then call this function.
+
+**Example**
+
+```python
+#encoding=utf8
+import paddle.fluid as fluid
+import paddleslim.quant as quant
+
+
+train_program = fluid.Program()
+
+with fluid.program_guard(train_program):
+    image = fluid.data(name='x', shape=[None, 1, 28, 28])
+    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+    conv = fluid.layers.conv2d(image, 32, 1)
+    feat = fluid.layers.fc(conv, 10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=feat, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+use_gpu = True
+place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+eval_program = train_program.clone(for_test=True)
+# quantization configuration
+config = {'weight_quantize_type': 'abs_max',
+          'activation_quantize_type': 'moving_average_abs_max'}
+build_strategy = fluid.BuildStrategy()
+exec_strategy = fluid.ExecutionStrategy()
+# call the quantization APIs
+quant_train_program = quant.quant_aware(train_program, place, config, for_test=False)
+quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True)
+# disable the fused strategies that conflict with quantization
+build_strategy.fuse_all_reduce_ops = False
+build_strategy.sync_batch_norm = False
+quant_train_program = quant_train_program.with_data_parallel(
+    loss_name=avg_cost.name,
+    build_strategy=build_strategy,
+    exec_strategy=exec_strategy)
+
+inference_prog = quant.convert(quant_eval_program, place, config)
+```
+
+For more detailed usage, see the quantization-aware training demo.
+
+## Post-training quantization API
+```
+paddleslim.quant.quant_post(executor,
+                            model_dir,
+                            quantize_model_path,
+                            sample_generator,
+                            model_filename=None,
+                            params_filename=None,
+                            batch_size=16,
+                            batch_nums=None,
+                            scope=None,
+                            algo='KL',
+                            quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"])
+
+```
+quant_post quantizes the model saved under ``${model_dir}``, using data from ``sample_generator`` to calibrate the quantization parameters.
+
+**Parameters:**
+- **executor (fluid.Executor)** - the executor that runs the model, on CPU or GPU.
+- **model_dir (str)** - the directory containing the model to quantize.
+- **quantize_model_path (str)** - the path where the quantized model is saved.
+- **sample_generator (python generator)** - reads data samples, yielding one sample at a time.
+- **model_filename (str, optional)** - name of the model file; if the network of the model to quantize is stored in a single file, set ``model_filename`` to that file's name, otherwise set it to ``None``. Default: ``None``.
+- **params_filename (str)** - name of the parameter file; if the parameters of the model to quantize are stored in a single file, set ``params_filename`` to that file's name, otherwise set it to ``None``. Default: ``None``.
+- **batch_size (int)** - number of images per batch. Default: 16.
+- **batch_nums (int, optional)** - number of iterations. If set to ``None``, the calibration runs until ``sample_generator`` is exhausted; otherwise it runs for ``batch_nums`` iterations, i.e. the number of samples used to calibrate the ``Scale`` values is ``batch_nums * batch_size``.
+- **scope (fluid.Scope, optional)** - the scope used to read and write ``Variable``s; if set to ``None``, ``fluid.global_scope()`` is used. Default: ``None``.
+- **algo (str)** - name of the algorithm used during quantization, either ``'KL'`` or ``'direct'``. It only affects the quantization of activations, since weights are always quantized with ``'channel_wise_abs_max'``. With ``'direct'``, the maximum absolute activation value over the calibration data is used as the ``Scale``; with ``'KL'``, the ``Scale`` is computed by minimizing the KL divergence. Default: ``'KL'``.
+- **quantizable_op_type (list[str])** - the list of ``op`` types to quantize. Default: ``["conv2d", "depthwise_conv2d", "mul"]``.
+
+**Returns**
+
+None.
+
+**Notes**
+
+Because this function collects all activation values over the calibration data, the number of calibration images should be kept small. Computing the ``'KL'`` divergence is also fairly slow.
+
+**Example**
+
+> Note: this example cannot be run as-is, because it needs to load a model from ``${model_dir}``.
+
+```python
+import paddle.fluid as fluid
+import paddle.dataset.mnist as reader
+from paddleslim.quant import quant_post
+val_reader = reader.train()
+use_gpu = True
+place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+
+exe = fluid.Executor(place)
+quant_post(
+    executor=exe,
+    model_dir='./model_path',
+    quantize_model_path='./save_path',
+    sample_generator=val_reader,
+    model_filename='__model__',
+    params_filename='__params__',
+    batch_size=16,
+    batch_nums=10)
+```
+For more detailed usage, see the post-training quantization demo.
+
+## Embedding quantization API
+```
+paddleslim.quant.quant_embedding(program, place, config, scope=None)
+```
+quant_embedding quantizes the parameters of an ``Embedding`` layer.
+
+**Parameters:**
+- **program (fluid.Program)** - the program to quantize.
+- **scope (fluid.Scope, optional)** - the scope used to read and write ``Variable``s; if set to ``None``, ``fluid.global_scope()`` is used.
+- **place (fluid.CPUPlace or fluid.CUDAPlace)** - the device on which the program runs.
+- **config (dict)** - the quantization configuration. Supported keys:
+    - ``'params_name'`` (str, required): name of the parameter to quantize. This key must be set.
+    - ``'quantize_type'`` (str, optional): quantization type; currently ``'abs_max'`` is supported, and ``'log'`` and ``'product_quantization'`` are planned. Default: ``'abs_max'``.
+    - ``'quantize_bits'`` (int, optional): number of quantization bits; currently only 8 is supported. Default: 8.
+    - ``'dtype'`` (str, optional): data type after quantization; currently only ``'int8'`` is supported. Default: ``'int8'``.
+    - ``'threshold'`` (float, optional): parameter values are clipped to this threshold before quantization; if unset, clipping is skipped and the values are quantized directly.
+
+**Returns**
+
+The quantized program.
+
+**Return type**
+
+``fluid.Program``
+
+**Example**
+```python
+import paddle.fluid as fluid
+import paddleslim.quant as quant
+
+train_program = fluid.Program()
+with fluid.program_guard(train_program):
+    input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
+    input_emb = fluid.embedding(
+        input=input_word,
+        is_sparse=False,
+        size=[100, 128],
+        param_attr=fluid.ParamAttr(name='emb',
+                                   initializer=fluid.initializer.Uniform(-0.005, 0.005)))
+
+infer_program = train_program.clone(for_test=True)
+
+use_gpu = True
+place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+
+config = {'params_name': 'emb', 'quantize_type': 'abs_max'}
+quant_program = quant.quant_embedding(infer_program, place, config)
+```
+
+For more detailed usage, see the Embedding quantization demo.
diff --git a/paddleslim/version.py b/paddleslim/version.py
index f7ed0174edd9207b6d582008761420c12dec8018..3e95d57aa8f5b558d818dfff8b5d85daba2c6068 100644
--- a/paddleslim/version.py
+++ b/paddleslim/version.py
@@ -14,4 +14,4 @@
 # limitations under the License.
 """ PaddleSlim version string """
 __all__ = ["slim_version"]
-slim_version = "4.0"
+slim_version = "0.1"
diff --git a/tests/test_prune.py b/tests/test_prune.py
index 3fdaa867e350af876648871f83fe70cc83b548b6..931cf9cf35429a1aa9ca53b5b5d8444f71d3ec39 100644
--- a/tests/test_prune.py
+++ b/tests/test_prune.py
@@ -50,7 +50,7 @@ class TestPrune(unittest.TestCase):
         scope = fluid.Scope()
         exe.run(startup_program, scope=scope)
         pruner = Pruner()
-        main_program = pruner.prune(
+        main_program, _, _ = pruner.prune(
             main_program,
             scope,
             params=["conv4_weights"],