Unverified commit 39700126, authored by cc, committed by GitHub

Rename quant api, update quant doc, test=develop, test=document (#338)

Parent 2ff43527
......@@ -110,7 +110,7 @@ PaddleSlim supports developers in terms of underlying capabilities, technical consulting and cooperation, and business scenarios
<a href="https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/quick_start/quant_aware_tutorial.md" target="_blank">量化训练快速开始示例</a>
</li>
<li>
<a href="https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/quick_start/quant_post_tutorial.md" target="_blank">离线量化快速开始示例</a>
<a href="https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/quick_start/quant_post_static_tutorial.md" target="_blank">静态离线量化快速开始示例</a>
</li>
<li>
<a href="https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/tutorials/paddledetection_slim_quantization_tutorial.md" target="_blank">检测模型量化教程</a>
......
......@@ -40,9 +40,9 @@ import numpy as np
- **quantize_op_types:** Quantization on CPU currently supports `depthwise_conv2d`, `mul`, `conv2d`, `matmul`, `transpose2`, `reshape2`, `pool2d`, and `scale`. However, when inserting fake quantize/dequantize ops during training, only the first four op types need them inserted before and after, because the input and output scales of the remaining ops (`transpose2`, `reshape2`, `pool2d`, `scale`) are unchanged and are taken from the input/output scales of the surrounding ops. Therefore the `quantize_op_types` parameter only needs to contain `depthwise_conv2d`, `mul`, `conv2d`, and `matmul`.
- **Other parameters:** please refer to the [PaddleSlim quant_aware API](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/#quant_aware); a configuration sketch is given below.
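For illustration, here is a minimal configuration sketch. Only ``quantize_op_types`` is prescribed above; the other keys and their values are assumptions based on the quant_aware API defaults and should be adapted to your own model:

```python
# Hypothetical quant_aware config for CPU (DNNL) deployment.
# Fake quantize/dequantize ops are inserted only around these four op types;
# transpose2 / reshape2 / pool2d / scale reuse the scales of neighboring ops.
quant_config = {
    'weight_quantize_type': 'channel_wise_abs_max',
    'activation_quantize_type': 'moving_average_abs_max',
    'weight_bits': 8,
    'activation_bits': 8,
    'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d', 'matmul'],
}
# Passed to quant_aware per the signature in this commit:
# quant_program = quant_aware(train_program, place, quant_config, for_test=False)
```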
#### 2.2 Post-training quantization
#### 2.2 Static post-training quantization
For producing a post-training quantized model, refer to the [post-training quantization workflow for classification models](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_post_demo/#_1).
For producing a static post-training quantized model, refer to the [static post-training quantization workflow for classification models](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_post_demo/#_1).
## 3. Convert the produced quantized model into a DNNL-optimized INT8 model
To deploy on CPU, we take the saved quant model and, through a conversion script, remove the fake quantize/dequantize ops, fuse some ops, and fully convert it into an INT8 model. The script below needs to be run from the directory where Paddle is located; in the repository it is [save_qat_model.py](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/save_qat_model.py). Copy the script into the demo directory (`/PATH_TO_PaddleSlim/demo/mkldnn_quant/quant_aware/`) and run the following command:
......
......@@ -12,7 +12,7 @@ sys.path[0] = os.path.join(
os.path.dirname("__file__"), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger
from paddleslim.analysis import flops
from paddleslim.quant import quant_aware, quant_post, convert
from paddleslim.quant import quant_aware, convert
import models
from utility import add_arguments, print_arguments
......
# Post-training quantization example
# Static post-training quantization example
This example shows how to use the post-training quantization API ``paddleslim.quant.quant_post`` to quantize a trained classification model. The API produces a quantized model without retraining, reducing the model's storage footprint and GPU memory usage.
This example shows how to use the static post-training quantization API ``paddleslim.quant.quant_post_static`` to quantize a trained classification model. The API produces a quantized model without retraining, reducing the model's storage footprint and GPU memory usage.
## API introduction
Please refer to the <a href='https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#quant-post'>quantization API documentation</a>.
Please refer to the <a href='https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#quant-post-static'>quantization API documentation</a>.
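As a rough orientation, the core call that the quant_post.py script shown below wraps looks like the following sketch. The argument names mirror the quick-start tutorial updated in this commit; ``some_calibration_reader`` is a placeholder for your own calibration data reader, and the paths match the demo command below:

```python
# Illustrative sketch of the quant_post_static call performed by quant_post.py.
import paddle.fluid as fluid
from paddleslim.quant import quant_post_static

exe = fluid.Executor(fluid.CPUPlace())

quant_post_static(
    executor=exe,
    model_dir='./inference_model/MobileNet',          # exported FP32 model ('model' + 'weights')
    quantize_model_path='./quant_model_train/MobileNet',
    sample_generator=some_calibration_reader,         # placeholder: yields calibration samples
    model_filename='model',
    params_filename='weights',
    batch_nums=10)
```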
## Post-training quantization workflow for a classification model
......@@ -30,10 +30,10 @@ python export_model.py --model "MobileNet" --pretrained_model ./pretrain/MobileN
```
The exported model is stored under the ``inference_model/MobileNet/`` directory, which contains two files: ``'model'`` and ``'weights'``.
### Post-training quantization
Next, apply post-training quantization to the exported model files. The post-training quantization script is [quant_post.py](./quant_post.py), which uses the ``paddleslim.quant.quant_post`` API to quantize the model. Run:
### Static post-training quantization
Next, apply static post-training quantization to the exported model files. The static post-training quantization script is [quant_post.py](./quant_post.py), which uses the ``paddleslim.quant.quant_post_static`` API to quantize the model. Run:
```
python quant_post.py --model_path ./inference_model/MobileNet --save_path ./quant_model_train/MobileNet --model_filename model --params_filename weights
python quant_post_static.py --model_path ./inference_model/MobileNet --save_path ./quant_model_train/MobileNet --model_filename model --params_filename weights
```
- ``model_path``: the directory containing the model to be quantized
......
......@@ -27,7 +27,7 @@ from dataloader.casia import CASIA_Face
from dataloader.lfw import LFW
from lfw_eval import parse_filelist, evaluation_10_fold
from paddleslim import models
from paddleslim.quant import quant_post
from paddleslim.quant import quant_post_static
def now():
......@@ -331,7 +331,7 @@ def main():
if args.action == 'train':
train(exe, train_program, train_out, test_program, test_out, args)
elif args.action == 'quant':
quant_post(
quant_post_static(
executor=exe,
model_dir='./out_inference/',
quantize_model_path='./quant_model/',
......
......@@ -8,5 +8,5 @@ Quick Start
pruning_tutorial_en.md
nas_tutorial_en.md
quant_aware_tutorial_en.md
quant_post_tutorial_en.md
quant_post_static_tutorial_en.md
......@@ -19,7 +19,8 @@ PaddleSlim supports developers in terms of underlying capabilities, technical consulting and cooperation, and business scenarios
- Fixed-point quantization
- Quantization-aware training (training aware)
- Post-training quantization (post training)
- Static post-training quantization (static post training)
- Dynamic post-training quantization (dynamic post training)
- Knowledge distillation
- Supports single-process knowledge distillation
......
......@@ -8,6 +8,6 @@
pruning_tutorial.md
distillation_tutorial.md
quant_aware_tutorial.md
quant_post_tutorial.md
quant_post_static_tutorial.md
nas_tutorial.md
# Quantization-aware training of an image classification model - quick start
This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use PaddleSlim's [quantization-aware training API](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html). The example includes the following steps:
This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use PaddleSlim's [quantization-aware training API](../api_cn/quantization_api.html). The example includes the following steps:
1. Import dependencies
2. Build the model
......
# Post-training quantization of an image classification model - quick start
# Static post-training quantization of an image classification model - quick start
This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use PaddleSlim's [post-training quantization API](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#quant-post). The example includes the following steps:
This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use PaddleSlim's [static post-training quantization API](../api_cn/quantization_api.html#quant-post-static). The example includes the following steps:
1. Import dependencies
2. Build the model
3. Train the model
4. Post-training quantization
4. Static post-training quantization
## 1. Import dependencies
PaddleSlim depends on Paddle 1.7. Make sure Paddle is installed correctly, then import Paddle and PaddleSlim as follows:
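The import block itself falls in a collapsed region of this diff; a minimal sketch of what the rest of the tutorial assumes (the ``fluid`` and ``slim`` aliases match the calls used below) is:

```python
# Minimal imports assumed by the following snippets.
import paddle
import paddle.fluid as fluid
import paddleslim as slim
```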
......@@ -90,7 +90,7 @@ test(val_program)
```
Save the inference model: the trained classification model is saved under ``'./inference_model'``, and the post-training quantization step later will load the model saved here.
Save the inference model: the trained classification model is saved under ``'./inference_model'``, and the static post-training quantization step later will load the model saved here.
```python
......@@ -102,29 +102,29 @@ fluid.io.save_inference_model(dirname='./inference_model',
main_program=val_program)
```
## 4. Post-training quantization
## 4. Static post-training quantization
Call the post-training quantization API to load the trained classification model from the ``'./inference_model'`` directory and calibrate its parameters with 10 batches of data. This process requires no training; it only runs the forward pass to compute the parameters needed for quantization. The post-training quantized model is saved under ``'./quant_post_model'``.
Call the static post-training quantization API to load the trained classification model from the ``'./inference_model'`` directory and calibrate its parameters with 10 batches of data. This process requires no training; it only runs the forward pass to compute the parameters needed for quantization. The statically quantized model is saved under ``'./quant_post_static_model'``.
```python
slim.quant.quant_post(
slim.quant.quant_post_static(
executor=exe,
model_dir='./inference_model',
quantize_model_path='./quant_post_model',
quantize_model_path='./quant_post_static_model',
sample_generator=reader.test(),
batch_nums=10)
```
Load the quantized model saved under ``'./quant_post_model'`` and test it. The accuracy is close to the test accuracy obtained in ``3.2 Training and testing``, so post-training quantization is nearly lossless for this classification model.
Load the quantized model saved under ``'./quant_post_static_model'`` and test it. The accuracy is close to the test accuracy obtained in ``3.2 Training and testing``, so static post-training quantization is nearly lossless for this classification model.
```python
quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
dirname='./quant_post_model',
quant_post_static_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
dirname='./quant_post_static_model',
model_filename='__model__',
params_filename='__params__',
executor=exe)
test(quant_post_prog, fetch_targets)
test(quant_post_static_prog, fetch_targets)
```
# Deploying INT8 models for inference on CPU
# Tutorial on deploying quantized models on CPU
On an Intel(R) Xeon(R) Gold 6271 machine, after quantization and DNNL acceleration, the single-thread performance of the INT8 model is 3-4x that of the original FP32 model; on an Intel(R) Xeon(R) Gold 6148, single-thread performance is 1.5x that of the original FP32 model, with only a tiny accuracy drop. For a sample image classification quantization tutorial, see [Optimized deployment and inference of image classification INT8 models on CPU](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/mkldnn_quant/quant_aware/PaddleCV_mkldnn_quantaware_tutorial_cn.md). For quantization of natural language processing models, see [Reproducing ERNIE INT8 model accuracy and performance](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn).
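For reference, loading the converted INT8 model with Paddle's native analysis predictor and MKL-DNN enabled could look like the sketch below. This is only an assumption-laden illustration: the class and method names come from the Paddle 1.x inference API, and the model path is a placeholder.

```python
# Sketch: run a DNNL-optimized INT8 model on CPU with the Paddle 1.x AnalysisConfig API.
import numpy as np
from paddle.fluid.core import AnalysisConfig, PaddleTensor, create_paddle_predictor

config = AnalysisConfig('./int8_model')        # placeholder path to the converted INT8 model
config.disable_gpu()                           # CPU inference
config.enable_mkldnn()                         # enable DNNL (MKL-DNN) kernels
config.set_cpu_math_library_num_threads(1)     # single thread, as in the numbers above

predictor = create_paddle_predictor(config)
fake_input = PaddleTensor(np.random.rand(1, 3, 224, 224).astype('float32'))
outputs = predictor.run([fake_input])
```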
......
......@@ -6,11 +6,11 @@
:maxdepth: 1
image_classification_sensitivity_analysis_tutorial.md
image_classification_mkldnn_quant_aware_tutorial.md
darts_nas_turorial.md
paddledetection_slim_distillation_tutorial.md
paddledetection_slim_nas_tutorial.md
paddledetection_slim_pruing_tutorial.md
paddledetection_slim_prune_dist_tutorial.md
paddledetection_slim_quantization_tutorial.md
image_classification_mkldnn_quant_aware_tutorial.md
paddledetection_slim_sensitivy_tutorial.md
# Fixed-point quantization tutorial for object detection models
# Quantization tutorial for object detection models
For the tutorial content, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/quantization/README.md
......@@ -7,7 +7,7 @@
### Training strategies
- In the quantization strategy column, `post` denotes a model obtained with post-training quantization, and `aware` denotes a model obtained with quantization-aware training.
- In the quantization strategy column, `post` denotes a model obtained with the static post-training quantization method, and `aware` denotes a model obtained with the quantization-aware training method.
### YOLOv3 on COCO
......
......@@ -21,7 +21,8 @@ _logger = get_logger(__name__, level=logging.INFO)
try:
fluid.require_version('2.0.0')
from .quanter import quant_aware, quant_post, convert, quant_post_only_weight
from .quanter import quant_aware, convert, quant_post_static, quant_post_dynamic
from .quanter import quant_post, quant_post_only_weight
except Exception as e:
_logger.warning(
"If you want to use training-aware and post-training quantization, "
......
......@@ -233,7 +233,7 @@ def quant_aware(program, place, config=None, scope=None, for_test=False):
return quant_program
def quant_post(executor,
def quant_post_static(executor,
model_dir,
quantize_model_path,
batch_generator=None,
......@@ -255,10 +255,10 @@ def quant_post(executor,
is_use_cache_file=False,
cache_dir="./temp_post_training"):
"""
The function utilizes post training quantization method to quantize the
fp32 model. It uses calibrate data to calculate the scale factor of
quantized variables, and inserts fake quantization and dequantization
operators to obtain the quantized model.
The function utilizes the static post training quantization method to
quantize the fp32 model. It uses calibration data to calculate the
scale factors of quantized variables, and inserts fake quantization
and dequantization operators to obtain the quantized model.
Args:
executor(fluid.Executor): The executor to load, run and save the
......@@ -340,26 +340,40 @@ def quant_post(executor,
model_filename=save_model_filename,
params_filename=save_params_filename)
# We have renamed quant_post to quant_post_static.
# For compatibility, we keep quant_post api for now, and it will be
# deprecated in the future.
quant_post = quant_post_static
def convert(program, place, config=None, scope=None, save_int8=False):
"""
convert quantized and well-trained ``program`` to final quantized ``program`` that can be used to save ``inference model``.
convert quantized and well-trained ``program`` to final quantized
``program`` that can be used to save ``inference model``.
Args:
program(fluid.Program): quantized and well-trained ``test program``.
place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents the executor run on which device.
config(dict, optional): configs for convert. if set None, will use default config.
It must be same with config that used in 'quant_aware'. Default: None.
scope(fluid.Scope, optional): Scope records the mapping between variable names and variables,
similar to brackets in programming languages. Usually users can use
`fluid.global_scope <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_. When ``None`` will use `fluid.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_ . Default: ``None``.
save_int8: Whether to return ``program`` which model parameters' dtype is ``int8``.
This parameter can only be used to get model size. Default: ``False``.
place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents
which device the executor runs on.
config(dict, optional): configs for convert. If set to None, the
default config will be used. It must be the same as the config
used in 'quant_aware'. Default is None.
scope(fluid.Scope, optional): Scope records the mapping between
variable names and variables, similar to brackets in
programming languages. Usually users can use
`fluid.global_scope <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.
When ``None`` will use
`fluid.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_
. Default: ``None``.
save_int8: Whether to also return a ``program`` whose model
parameters' dtype is ``int8``. This parameter can only be used
to get the model size. Default: ``False``.
Returns:
Tuple : freezed program which can be used for inference.
when ``save_int8`` is False, return ``freezed_program(fluid.Program)``.
when ``save_int8`` is True, return ``freezed_program(fluid.Program)`` and ``freezed_program_int8(fluid.Program)``
when ``save_int8`` is False, return ``freezed_program(fluid.Program)``.
when ``save_int8`` is True, return ``freezed_program(fluid.Program)``
and ``freezed_program_int8(fluid.Program)``
"""
scope = fluid.global_scope() if not scope else scope
......@@ -395,7 +409,7 @@ def convert(program, place, config=None, scope=None, save_int8=False):
return freezed_program
def quant_post_only_weight(model_dir,
def quant_post_dynamic(model_dir,
save_model_dir,
model_filename=None,
params_filename=None,
......@@ -405,19 +419,26 @@ def quant_post_only_weight(model_dir,
weight_bits=8,
generate_test_model=False):
'''
In order to reduce the size of model, this api quantizes the weight
of some ops from float32 to int8/16. In the inference stage, the
quantized weight will be dequantized to float32 again.
The function utilizes the dynamic post training quantization method
to quantize the fp32 model. In detail, it quantizes the weights of
some ops from float32 to int8/16. For the quantized model, there are
two kinds of calculation methods in the inference stage. Firstly, the
quantized weights can be dequantized to float32 and the computation
applied in float32. Secondly, the quantized scales of the inputs can
be collected and the computation applied in int8.
Args:
model_dir(str): The path of the fp32 model that will be quantized,
and the model and params files are under the path.
and the model and params files are under the path.
save_model_dir(str): The path to save the quantized model.
model_filename(str, optional): The name of file used to load the inference
program. If it is None, the default filename '__model__' will be used. Default is 'None'.
params_filename(str, optional): The name of file used to load all parameters. When all parameters were saved
in a single binary file, set it as the real filename. If parameters were saved in separate files,
set it as 'None'. Default is 'None'.
model_filename(str, optional): The name of file used to load the
inference program. If it is None, the default filename
'__model__' will be used. Default is 'None'.
params_filename(str, optional): The name of file used to load all
parameters. When all parameters were saved in a single
binary file, set it as the real filename. If parameters
were saved in separate files, set it as 'None'. Default is
'None'.
save_model_dir(str): The path used to save the quantized model.
save_model_filename(str, optional): The name of file to
save the inference program. If it is None, the default
......@@ -442,6 +463,7 @@ def quant_post_only_weight(model_dir,
model_dir=model_dir,
model_filename=model_filename,
params_filename=params_filename)
weight_quant.quantize_weight_to_int(
save_model_dir=save_model_dir,
save_model_filename=save_model_filename,
......@@ -449,3 +471,9 @@ def quant_post_only_weight(model_dir,
quantizable_op_type=quantizable_op_type,
weight_bits=weight_bits,
generate_test_model=generate_test_model)
# We have renamed quant_post_only_weight to quant_post_dynamic.
# For compatibility, we keep quant_post_only_weight api for now,
# and it will be deprecated in the future.
quant_post_only_weight = quant_post_dynamic
\ No newline at end of file
......@@ -16,7 +16,7 @@ sys.path.append("../")
import unittest
import paddle
import paddle.fluid as fluid
from paddleslim.quant import quant_post
from paddleslim.quant import quant_post_static
sys.path.append("../demo")
from models import MobileNet
from layers import conv_bn_layer
......@@ -98,7 +98,7 @@ class TestQuantAwareCase1(unittest.TestCase):
model_filename='model',
params_filename='params')
quant_post(
quant_post_static(
exe,
'./test_quant_post',
'./test_quant_post_inference',
......
......@@ -16,7 +16,7 @@ sys.path.append("../")
import unittest
import paddle
import paddle.fluid as fluid
from paddleslim.quant import quant_post_only_weight
from paddleslim.quant import quant_post_dynamic
sys.path.append("../demo")
from models import MobileNet
from layers import conv_bn_layer
......@@ -90,7 +90,7 @@ class TestQuantPostOnlyWeightCase1(unittest.TestCase):
train(main_prog)
top1_1, top5_1 = test(val_prog)
fluid.io.save_inference_model(
dirname='./test_quant_post',
dirname='./test_quant_post_dynamic',
feeded_var_names=[image.name, label.name],
target_vars=[avg_cost, acc_top1, acc_top5],
main_program=val_prog,
......@@ -98,7 +98,7 @@ class TestQuantPostOnlyWeightCase1(unittest.TestCase):
model_filename='model',
params_filename='params')
quant_post_only_weight(
quant_post_dynamic(
model_dir='./test_quant_post',
save_model_dir='./test_quant_post_inference',
model_filename='model',
......