Add quant2 int8 lstm model test (#35887) (#35912)

Co-authored-by: N joanna.wozna.intel <joanna.wozna@intel.com>

Add quant2 int8 lstm model test (#35887) (#35912)
Co-authored-by: N joanna.wozna.intel <joanna.wozna@intel.com>
e8e77ebe · lidanqing · GitHub · c67cf85d · e8e77ebe · e8e77ebe
4 changed files
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -93,7 +93,8 @@ class Quant2Int8MkldnnPass(object):
        graph = self._dequantize_weights(graph)
        graph = self._optimize_fp32_graph(graph)
        graph = self._compute_weight_scales(graph)
-        graph = self._update_relu_output_scales(graph)
+        # This function causes nondeterministic quantization behavior
+        # graph = self._update_relu_output_scales(graph)
        graph = self._propagate_scales(graph)
        graph = self._quantize_fp32_graph(graph)
        graph = self._final_optimizations(graph)

--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -92,17 +92,14 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da
 		 --ops_to_quantize ${ops_to_quantize})
 endfunction()

-function(inference_quant2_int8_lstm_model_test target fp32_model dataset_path)
+function(inference_quant2_int8_lstm_model_test target fp32_model quant_model dataset_path)
    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py"
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=true
            ARGS --fp32_model ${fp32_model}
+                 --quant_model ${quant_model}
                 --infer_data ${dataset_path}
-                 --num_threads 4
+                 --num_threads 1
                 --mkldnn_cache_capacity 100
                 --warmup_iter 100
-                 --warmup_batch_size 1
                 --acc_diff_threshold 0.11)
 endfunction()

@@ -293,11 +290,10 @@ if(LINUX AND WITH_MKLDNN)

 	# PTQ int8 lstm model
 	set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz")
-	set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2")
 	download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7)
 	set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz")
 	download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743)
-	inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
+	inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)

 endif()


--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
@@ -20,30 +20,28 @@ import time
 import unittest
 from paddle import fluid
 from paddle.fluid.core import AnalysisConfig, create_paddle_predictor
+from save_quant_model import transform_and_save_int8_model


 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--fp32_model', type=str, default='', help='A path to a FP32 model.')
-    parser.add_argument('--infer_data', type=str, default='', help='Data file.')
    parser.add_argument(
-        '--num_threads', type=int, default=1, help='Number of threads.')
+        '--quant_model', type=str, default='', help='A path to a quant model.')
+    parser.add_argument('--infer_data', type=str, default='', help='Data file.')
    parser.add_argument(
        '--warmup_iter',
        type=int,
        default=1,
        help='Number of the first iterations to skip in performance statistics.')
-    parser.add_argument(
-        '--warmup_batch_size',
-        type=int,
-        default=1,
-        help='Number of batches to use in PTQ warmup. Default: 1.')
    parser.add_argument(
        '--acc_diff_threshold',
        type=float,
        default=0.01,
        help='Accepted accuracy difference threshold.')
+    parser.add_argument(
+        '--num_threads', type=int, default=1, help='Number of threads.')
    parser.add_argument(
        '--mkldnn_cache_capacity',
        type=int,
@@ -56,7 +54,7 @@ def parse_args():


 class TestLstmModelPTQ(unittest.TestCase):
-    def get_warmup_tensor(self, data_path, place, warmup_batch_size):
+    def get_warmup_tensor(self, data_path, place):
        data = []
        with open(data_path, 'rb') as in_f:
            while True:
@@ -87,30 +85,31 @@ class TestLstmModelPTQ(unittest.TestCase):
                infer_label.shape = label.shape
                infer_label.dtype = fluid.core.PaddleDType.INT32
                data.append([infer_data, infer_label])
-        warmup_data = data[:warmup_batch_size]
-        inputs = data[warmup_batch_size:]
+        warmup_data = data[:1]
+        inputs = data[1:]
        return warmup_data, inputs

    def set_config(self,
                   model_path,
                   num_threads,
                   mkldnn_cache_capacity,
-                   warmup_batch_size,
                   warmup_data=None,
-                   enable_int8=False):
+                   use_analysis=False,
+                   enable_ptq=False):
        config = AnalysisConfig(model_path)
+        config.set_cpu_math_library_num_threads(num_threads)
+        if use_analysis:
            config.disable_gpu()
            config.switch_use_feed_fetch_ops(True)
            config.switch_ir_optim(True)
-        config.set_cpu_math_library_num_threads(num_threads)
-        # This pass to work properly, must be added before fc_fuse_pass
-        config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
            config.enable_mkldnn()
            config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
-        if enable_int8:
+            if enable_ptq:
+                # For this pass to work properly, it must be added before fc_fuse_pass
+                config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass")
                config.enable_quantizer()
                config.quantizer_config().set_quant_data(warmup_data)
-            config.quantizer_config().set_quant_batch_size(warmup_batch_size)
+                config.quantizer_config().set_quant_batch_size(1)
        return config

    def run_program(self,
@@ -119,15 +118,13 @@ class TestLstmModelPTQ(unittest.TestCase):
                    num_threads,
                    mkldnn_cache_capacity,
                    warmup_iter,
-                    warmup_batch_size,
-                    enable_ptq_int8=False):
+                    use_analysis=False,
+                    enable_ptq=False):
        place = fluid.CPUPlace()
-        warmup_data, inputs = self.get_warmup_tensor(data_path, place,
-                                                     warmup_batch_size)
+        warmup_data, inputs = self.get_warmup_tensor(data_path, place)
        warmup_data = [item[0] for item in warmup_data]
        config = self.set_config(model_path, num_threads, mkldnn_cache_capacity,
-                                 warmup_batch_size, warmup_data,
-                                 enable_ptq_int8)
+                                 warmup_data, use_analysis, enable_ptq)

        predictor = create_paddle_predictor(config)
        data = [item[0] for item in inputs]
@@ -183,34 +180,47 @@ class TestLstmModelPTQ(unittest.TestCase):

        fp32_model = test_case_args.fp32_model
        assert fp32_model, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.'
+        quant_model = test_case_args.quant_model
+        assert quant_model, 'The quant model path cannot be empty. Please, use the --quant_model option.'
        infer_data = test_case_args.infer_data
        assert infer_data, 'The dataset path cannot be empty. Please, use the --infer_data option.'
        num_threads = test_case_args.num_threads
        mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity
        warmup_iter = test_case_args.warmup_iter
-        warmup_batch_size = test_case_args.warmup_batch_size
        acc_diff_threshold = test_case_args.acc_diff_threshold

        (fp32_hx_acc, fp32_ctc_acc, fp32_fps) = self.run_program(
            fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, warmup_batch_size, False)
+            warmup_iter, False, False)

        (int8_hx_acc, int8_ctc_acc, int8_fps) = self.run_program(
            fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, warmup_batch_size, True)
+            warmup_iter, True, True)
+
+        quant_model_save_path = quant_model + "_int8"
+        # transform model to quant2
+        transform_and_save_int8_model(quant_model, quant_model_save_path,
+                                      "fusion_lstm,concat")

-        print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
+        (quant_hx_acc, quant_ctc_acc, quant_fps) = self.run_program(
+            quant_model_save_path, infer_data, num_threads,
+            mkldnn_cache_capacity, warmup_iter, True, False)
+
+        print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}".format(
            fp32_fps, fp32_hx_acc, fp32_ctc_acc))

-        print("PTQ INT8: fps {0}, hx_acc {1}, ctc_acc {2}.".format(
+        print("PTQ_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format(
            int8_fps, int8_hx_acc, int8_ctc_acc))

+        print("QUANT2_INT8: fps {0}, hx_acc {1}, ctc_acc {2}".format(
+            quant_fps, quant_hx_acc, quant_ctc_acc))
+
        sys.stdout.flush()

-        hx_delta_value = fp32_hx_acc - int8_hx_acc
-        ctc_delta_value = fp32_ctc_acc - int8_ctc_acc
-        self.assertLess(hx_delta_value, acc_diff_threshold)
-        self.assertLess(ctc_delta_value, acc_diff_threshold)
+        self.assertLess(fp32_hx_acc - int8_hx_acc, acc_diff_threshold)
+        self.assertLess(fp32_ctc_acc - int8_ctc_acc, acc_diff_threshold)
+        self.assertLess(fp32_hx_acc - quant_hx_acc, acc_diff_threshold)
+        self.assertLess(fp32_ctc_acc - quant_ctc_acc, acc_diff_threshold)


 if __name__ == "__main__":

--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -16,11 +16,6 @@ import unittest
 import os
 import sys
 import argparse
-import logging
-import struct
-import six
-import numpy as np
-import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
@@ -62,7 +57,11 @@ def parse_args():
    return test_args, sys.argv[:1] + args


-def transform_and_save_int8_model(original_path, save_path):
+def transform_and_save_int8_model(original_path,
+                                  save_path,
+                                  ops_to_quantize='',
+                                  op_ids_to_skip='',
+                                  debug=False):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.executor.global_scope()
@@ -75,24 +74,26 @@ def transform_and_save_int8_model(original_path, save_path):
             fetch_targets] = fluid.io.load_inference_model(original_path, exe,
                                                            'model', 'params')

-        ops_to_quantize = set()
-        if len(test_args.ops_to_quantize) > 0:
-            ops_to_quantize = set(test_args.ops_to_quantize.split(','))
+        ops_to_quantize_set = set()
+        print(ops_to_quantize)
+        if len(ops_to_quantize) > 0:
+            ops_to_quantize_set = set(ops_to_quantize.split(','))

-        op_ids_to_skip = set([-1])
-        if len(test_args.op_ids_to_skip) > 0:
-            op_ids_to_skip = set(map(int, test_args.op_ids_to_skip.split(',')))
+        op_ids_to_skip_set = set([-1])
+        print(op_ids_to_skip)
+        if len(op_ids_to_skip) > 0:
+            op_ids_to_skip_set = set(map(int, op_ids_to_skip.split(',')))

        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
-        if (test_args.debug):
+        if (debug):
            graph.draw('.', 'quant_orig', graph.all_op_nodes())
        transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
-            ops_to_quantize,
-            _op_ids_to_skip=op_ids_to_skip,
+            ops_to_quantize_set,
+            _op_ids_to_skip=op_ids_to_skip_set,
            _scope=inference_scope,
            _place=place,
            _core=core,
-            _debug=test_args.debug)
+            _debug=debug)
        graph = transform_to_mkldnn_int8_pass.apply(graph)
        inference_program = graph.to_program()
        with fluid.scope_guard(inference_scope):
@@ -106,5 +107,6 @@ def transform_and_save_int8_model(original_path, save_path):
 if __name__ == '__main__':
    global test_args
    test_args, remaining_args = parse_args()
-    transform_and_save_int8_model(test_args.quant_model_path,
-                                  test_args.int8_model_save_path)
+    transform_and_save_int8_model(
+        test_args.quant_model_path, test_args.int8_model_save_path,
+        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug)