From d3d94d15a92453dfba6345feca0314a28bb2bb32 Mon Sep 17 00:00:00 2001 From: Liufang Sang Date: Wed, 13 May 2020 04:41:39 -0500 Subject: [PATCH] [cherry-pick] refine quant and add out scale pass (#275) * cherry-pick update quant_aware and quant_post for paddle version 2.0 (#244) * cherry-pick add out scale for quantization (#272) * add out scale for quantization * update quant_aware unittest * update quant_aware unittest --- paddleslim/quant/__init__.py | 2 +- paddleslim/quant/quanter.py | 58 ++++++++++++++++++++++++++---------- tests/test_quant_aware.py | 5 ++-- tests/test_quant_post.py | 7 +++-- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/paddleslim/quant/__init__.py b/paddleslim/quant/__init__.py index 31199ef1..5eaac4e1 100644 --- a/paddleslim/quant/__init__.py +++ b/paddleslim/quant/__init__.py @@ -25,6 +25,6 @@ try: except Exception as e: _logger.warning( "If you want to use training-aware and post-training quantization, " - "please use Paddle >= 1.7.0 or develop version") + "please use Paddle >= 2.0.0 or develop version") from .quant_embedding import quant_embedding diff --git a/paddleslim/quant/quanter.py b/paddleslim/quant/quanter.py index f3afbd63..a2497d76 100755 --- a/paddleslim/quant/quanter.py +++ b/paddleslim/quant/quanter.py @@ -24,6 +24,8 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass from paddle.fluid.contrib.slim.quantization import TransformForMobilePass from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass +from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass +from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass from paddle.fluid import core from paddle.fluid.contrib.slim.quantization import WeightQuantization @@ -46,8 +48,8 @@ ACTIVATION_QUANTIZATION_TYPES_TENSORRT = [ VALID_DTYPES = ['int8'] TRANSFORM_PASS_OP_TYPES = QuantizationTransformPass._supported_quantizable_op_type -QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type + \ - AddQuantDequantPass._activation_type +QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type + TENSORRT_OP_TYPES = [ 'mul', 'conv2d', 'pool2d', 'depthwise_conv2d', 'elementwise_add', 'leaky_relu' @@ -220,6 +222,10 @@ def quant_aware(program, place, config=None, scope=None, for_test=False): quantizable_op_type=quant_dequant_ops) quant_dequant_pass.apply(main_graph) + out_scale_training_pass = OutScaleForTrainingPass( + scope=scope, place=place, moving_rate=config['moving_rate']) + out_scale_training_pass.apply(main_graph) + if for_test: quant_program = main_graph.to_program() else: @@ -230,9 +236,12 @@ def quant_aware(program, place, config=None, scope=None, for_test=False): def quant_post(executor, model_dir, quantize_model_path, - sample_generator, + batch_generator=None, + sample_generator=None, model_filename=None, params_filename=None, + save_model_filename='__model__', + save_params_filename='__params__', batch_size=16, batch_nums=None, scope=None, @@ -241,6 +250,8 @@ def quant_post(executor, is_full_quantize=False, weight_bits=8, activation_bits=8, + activation_quantize_type='range_abs_max', + weight_quantize_type='channel_wise_abs_max', is_use_cache_file=False, cache_dir="./temp_post_training"): """ @@ -257,6 +268,10 @@ def quant_post(executor, are under the path. quantize_model_path(str): The path to save quantized model using api ``fluid.io.save_inference_model``. 
+        batch_generator(Python Generator): The batch generator provides
+                calibration data for DataLoader, and it returns a batch every
+                time. Only one of sample_generator and batch_generator can be
+                set. Besides, batch_generator supports LoD tensors.
        sample_generator(Python Generator): The sample generator provides
                calibrate data for DataLoader, and it only returns a sample every time.
        model_filename(str, optional): The name of model file. If parameters
@@ -265,6 +280,9 @@ def quant_post(executor,
                When all parameters are saved in a single file, set it
                as filename. If parameters are saved in separate files,
                set it as 'None'. Default : 'None'.
+        save_model_filename(str): The name of the model file to save the quantized inference program. Default: '__model__'.
+        save_params_filename(str): The name of the file to save all related parameters.
+                If it is set to None, parameters will be saved in separate files. Default: '__params__'.
        batch_size(int, optional): The batch size of DataLoader, default is 16.
        batch_nums(int, optional): If batch_nums is not None, the number of calibrate
                        data is 'batch_size*batch_nums'. If batch_nums is None, use all data
@@ -279,6 +297,15 @@ def quant_post(executor,
                        "mul"].
        weight_bits(int, optional): quantization bit number for weights.
        activation_bits(int): quantization bit number for activation.
+        activation_quantize_type(str): quantization type for activation,
+            now supports 'range_abs_max', 'moving_average_abs_max' and 'abs_max'.
+            This parameter only specifies the fake ops in the quantized model.
+            If it is 'range_abs_max' or 'moving_average_abs_max', we save the scale
+            obtained by post-training quantization in the fake ops. If it
+            is 'abs_max', the scale will not be saved in the fake ops.
+        weight_quantize_type(str): quantization type for weights,
+            supports 'abs_max' and 'channel_wise_abs_max'. Compared to 'abs_max',
+            the model accuracy is usually higher when using 'channel_wise_abs_max'.
        is_full_quantize(bool): if True, apply quantization to all supported quantizable op type.
                        If False, only apply quantization to the input quantizable_op_type. Default is False.
        is_use_cache_file(bool): If False, all temp data will be saved in memory.
If True, @@ -291,6 +318,7 @@ def quant_post(executor, post_training_quantization = PostTrainingQuantization( executor=executor, sample_generator=sample_generator, + batch_generator=batch_generator, model_dir=model_dir, model_filename=model_filename, params_filename=params_filename, @@ -302,10 +330,15 @@ def quant_post(executor, is_full_quantize=is_full_quantize, weight_bits=weight_bits, activation_bits=activation_bits, + activation_quantize_type=activation_quantize_type, + weight_quantize_type=weight_quantize_type, is_use_cache_file=is_use_cache_file, cache_dir=cache_dir) post_training_quantization.quantize() - post_training_quantization.save_quantized_model(quantize_model_path) + post_training_quantization.save_quantized_model( + quantize_model_path, + model_filename=save_model_filename, + params_filename=save_params_filename) def convert(program, place, config=None, scope=None, save_int8=False): @@ -336,12 +369,10 @@ def convert(program, place, config=None, scope=None, save_int8=False): assert isinstance(config, dict), "config must be dict" config = _parse_configs(config) _logger.info("convert config {}".format(config)) - test_graph = IrGraph(core.Graph(program.desc), for_test=True) - support_op_types = [] - for op in config['quantize_op_types']: - if op in QuantizationFreezePass._supported_quantizable_op_type: - support_op_types.append(op) + + out_scale_infer_pass = OutScaleForInferencePass(scope=scope) + out_scale_infer_pass.apply(test_graph) # Freeze the graph after training by adjusting the quantize # operators' order for the inference. @@ -350,16 +381,13 @@ def convert(program, place, config=None, scope=None, save_int8=False): place=place, weight_bits=config['weight_bits'], activation_bits=config['activation_bits'], - weight_quantize_type=config['weight_quantize_type'], - quantizable_op_type=support_op_types) + weight_quantize_type=config['weight_quantize_type']) + freeze_pass.apply(test_graph) freezed_program = test_graph.to_program() if save_int8: - convert_int8_pass = ConvertToInt8Pass( - scope=fluid.global_scope(), - place=place, - quantizable_op_type=support_op_types) + convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place) convert_int8_pass.apply(test_graph) freezed_program_int8 = test_graph.to_program() return freezed_program, freezed_program_int8 diff --git a/tests/test_quant_aware.py b/tests/test_quant_aware.py index 208944b0..16c93fe1 100644 --- a/tests/test_quant_aware.py +++ b/tests/test_quant_aware.py @@ -132,7 +132,7 @@ class TestQuantAwareCase2(unittest.TestCase): def test(program): iter = 0 result = [[], [], []] - for data in train_reader(): + for data in eval_reader(): cost, top1, top5 = exe.run( program, feed=feeder.feed(data), @@ -161,7 +161,8 @@ class TestQuantAwareCase2(unittest.TestCase): main_prog, place, config, for_test=False) quant_eval_prog = quant_aware(val_prog, place, config, for_test=True) train(quant_train_prog) - quant_eval_prog = convert(quant_eval_prog, place, config) + quant_eval_prog, int8_prog = convert( + quant_eval_prog, place, config, save_int8=True) top1_2, top5_2 = test(quant_eval_prog) # values before quantization and after quantization should be close print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1)) diff --git a/tests/test_quant_post.py b/tests/test_quant_post.py index 8c916b7b..17068410 100644 --- a/tests/test_quant_post.py +++ b/tests/test_quant_post.py @@ -101,12 +101,15 @@ class TestQuantAwareCase1(unittest.TestCase): exe, './test_quant_post', './test_quant_post_inference', - 
paddle.dataset.mnist.test(), + sample_generator=paddle.dataset.mnist.test(), model_filename='model', params_filename='params', batch_nums=10) quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model( - dirname='./test_quant_post_inference', executor=exe) + dirname='./test_quant_post_inference', + executor=exe, + model_filename='__model__', + params_filename='__params__') top1_2, top5_2 = test(quant_post_prog, fetch_targets) print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1)) print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2)) -- GitLab
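
Reviewer note (not part of the patch): a minimal, hedged sketch of the training-aware flow this change touches. The toy network, readers, and config values below are illustrative assumptions; only quant_aware() and convert() come from this patch, and 'moving_rate' (used by the new OutScaleForTrainingPass) is left at its default.

import paddle.fluid as fluid
from paddleslim.quant import quant_aware, convert

place = fluid.CPUPlace()
exe = fluid.Executor(place)

train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
    image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    out = fluid.layers.fc(input=image, size=10, act='softmax')
    avg_cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=out, label=label))
    # Clone the evaluation program before adding optimizer ops.
    val_prog = train_prog.clone(for_test=True)
    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_cost)

exe.run(startup_prog)

# Assumed quantization config; keys mirror the existing quant config dict.
config = {
    'weight_quantize_type': 'channel_wise_abs_max',
    'activation_quantize_type': 'moving_average_abs_max',
    'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
}

quant_train_prog = quant_aware(train_prog, place, config, for_test=False)
quant_eval_prog = quant_aware(val_prog, place, config, for_test=True)

# ... fine-tune quant_train_prog with feeds of 'image' and 'label' here ...

# convert() now also applies OutScaleForInferencePass before freezing; with
# save_int8=True it returns both the frozen float program and an int8 copy.
quant_eval_prog, int8_prog = convert(
    quant_eval_prog, place, config, save_int8=True)

Passing the same config to quant_aware() and convert() keeps the fake-quant settings and the collected out-scales consistent between training and inference.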
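Similarly, a hedged sketch of the updated post-training entry point. The model paths and the calibration batch_generator are placeholders; only one of sample_generator and batch_generator should be supplied, and the save_* names below simply restate the new defaults.

import numpy as np
import paddle.fluid as fluid
from paddleslim.quant import quant_post

place = fluid.CPUPlace()
exe = fluid.Executor(place)

def calib_batch_generator():
    # Illustrative calibration batches; shapes must match the inputs of the
    # saved FP32 inference model.
    for _ in range(10):
        yield [np.random.random((16, 1, 28, 28)).astype('float32')]

quant_post(
    executor=exe,
    model_dir='./fp32_model',                # assumed FP32 inference model dir
    quantize_model_path='./quant_post_model',
    batch_generator=calib_batch_generator,   # new: yields whole batches, LoD supported
    model_filename='model',                  # set to None if not a single model file
    params_filename='params',
    save_model_filename='__model__',
    save_params_filename='__params__',
    batch_size=16,
    batch_nums=10,
    activation_quantize_type='range_abs_max',
    weight_quantize_type='channel_wise_abs_max')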