Collect weight threshold for lstm op in post_training_quantization (#28701)

* Collect weight threshold of lstm, test=develop

Collect weight threshold for lstm op in post_training_quantization (#28701)
* Collect weight threshold of lstm, test=develop
5d8d463c · cc · GitHub · 11e78eba · 5d8d463c · 5d8d463c
4 changed files
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -29,6 +29,7 @@ from .quantization_pass import _out_scale_op_list
 from .quantization_pass import _get_op_input_var_names
 from .quantization_pass import _get_op_output_var_names
 from .quantization_pass import _get_output_name_index
+from .quantization_pass import _get_input_name_index
 from .quantization_pass import _channelwise_quant_axis1_ops

 __all__ = ['PostTrainingQuantization', 'WeightQuantization']
@@ -253,9 +254,11 @@ class PostTrainingQuantization(object):
        ]
        self._support_weight_quantize_type = ['abs_max', 'channel_wise_abs_max']
        self._support_algo_type = ['KL', 'abs_max', 'min_max']
+        self._dynamic_quantize_op_type = ['lstm']
        self._support_quantize_op_type = \
            list(set(QuantizationTransformPass._supported_quantizable_op_type +
-                AddQuantDequantPass._supported_quantizable_op_type))
+                AddQuantDequantPass._supported_quantizable_op_type +
+                self._dynamic_quantize_op_type))

        # Check inputs
        assert executor is not None, "The executor cannot be None."
@@ -381,6 +384,10 @@ class PostTrainingQuantization(object):
            self._save_input_threhold()

        self._save_output_threshold()
+        if any(op_type in self._quantizable_op_type
+               for op_type in self._dynamic_quantize_op_type):
+            self._collect_dynamic_quantize_op_threshold(
+                self._dynamic_quantize_op_type)
        return self._program

    def save_quantized_model(self,
@@ -776,6 +783,34 @@ class PostTrainingQuantization(object):
                for var_name in out_var_names:
                    analysis_and_save_info(op, var_name)

+    def _collect_dynamic_quantize_op_threshold(self, target_ops_type):
+        """
+        Store the abs-max weight threshold on dynamic quantize ops.
+
+        Every op of the program whose type is listed in ``target_ops_type``
+        is visited. For each of its input variable names (as reported by
+        ``_get_op_input_var_names``) that is also contained in the result
+        of ``_all_persistable_var_names``, the maximum absolute value of
+        the variable data is saved on the op as the attribute
+        ``<argname><index>_threshold``. The op additionally receives the
+        ``quantization_type`` and ``bit_length`` attributes.
+
+        Args:
+            target_ops_type(list): the op types to process
+        Returns:
+            None
+        """
+        program = self._program
+        matched_ops = [
+            op
+            for block_id in range(program.num_blocks)
+            for op in program.block(block_id).ops
+            if op.type in target_ops_type
+        ]
+
+        quant_type = str("post_" + self._algo).lower()
+        persistable_names = _all_persistable_var_names(program)
+        for op in matched_ops:
+            for var_name in _get_op_input_var_names(op):
+                if var_name not in persistable_names:
+                    continue
+                weight = _load_variable_data(self._scope, var_name)
+                abs_max = float(np.max(np.abs(weight)))
+                argname, index = _get_input_name_index(op, var_name)
+                threshold_attr = argname + str(index) + "_threshold"
+                op._set_attr(threshold_attr, abs_max)
+                op._set_attr("quantization_type", quant_type)
+                op._set_attr("bit_length", self._weight_bits)
+
    def _get_kl_scaling_factor(self, hist, hist_edeges, num_quantized_bins=255):
        '''
        Using the KL-divergenc method to get the more precise scaling factor.

--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -120,6 +120,7 @@ _op_real_in_out_name = {
    "hard_swish": [["X"], ["Out"]],
    "hard_sigmoid": [["X"], ["Out"]],
    "gru": [["Input", "Weight"], ["Hidden"]],
+    "lstm": [["Input", "Weight"], ["Hidden"]],
 }

 _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
@@ -144,6 +145,21 @@ def _get_op_input_var_names(op):
    return var_names


+def _get_input_name_index(op, input_var_name):
+    """
+    Locate ``input_var_name`` among the inputs of ``op``.
+
+    The input argument names registered for the op type in
+    ``_op_real_in_out_name`` are scanned in order, and every variable list
+    returned by ``op.input(argname)`` is searched for ``input_var_name``.
+
+    Returns:
+        tuple: ``(argname, index)`` of the last occurrence found, or
+        ``None`` when the variable is not an input of the op.
+    """
+    assert isinstance(op, (IrNode, Operator)), \
+        "The input op should be IrNode or Operator."
+    if isinstance(op, IrNode):
+        op_type = op.name()
+    else:
+        op_type = op.type
+
+    match = None
+    for argname in _op_real_in_out_name[op_type][0]:
+        for position, name in enumerate(op.input(argname)):
+            if name == input_var_name:
+                # Keep scanning: a later occurrence overrides an earlier one.
+                match = (argname, position)
+    return match
+
+
 def _get_op_output_var_names(op):
    """ """
    assert isinstance(op, (IrNode, Operator)), \

--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -124,6 +124,7 @@ if(WIN32)
 	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
 	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
 	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
+	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
 	list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
 	list(REMOVE_ITEM TEST_OPS test_quantize_transpiler_v2)
 endif()
@@ -300,8 +301,9 @@ endforeach()

 # setting timeout value for old unittests
 if(NOT WIN32)
+    set_tests_properties(test_post_training_quantization_lstm_model PROPERTIES TIMEOUT 120)
    set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY")
-	set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY")
+    set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY")
    set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120)
    set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120)
 endif()

--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import os
+import time
+import sys
+import random
+import math
+import functools
+import contextlib
+import struct
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.dataset.common import download
+from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
+
+# NOTE(review): presumably switches Paddle to static-graph mode, which the
+# Executor / inference-program APIs used below rely on -- confirm.
+paddle.enable_static()
+
+# Seed Python's and NumPy's random number generators with fixed values.
+random.seed(0)
+np.random.seed(0)
+
+
+class TestPostTrainingQuantization(unittest.TestCase):
+    """
+    Base test case comparing an FP32 inference model with the model
+    produced from it by ``PostTrainingQuantization``.
+
+    Subclasses call ``run_test`` with the model/data archives to download
+    and the quantization settings; ``run_test`` asserts that the accuracy
+    drop of the quantized model stays below ``diff_threshold``.
+    """
+
+    def setUp(self):
+        """Prepare the cache paths and a timestamped output directory."""
+        self.download_path = 'int8/download'
+        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
+                                               self.download_path)
+        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        self.int8_model_path = os.path.join(os.getcwd(),
+                                            "post_training_" + self.timestamp)
+        # os.system() reports failure through its return code and never
+        # raises, so the directory is created with os.makedirs() to make
+        # the error handling below effective.
+        try:
+            if not os.path.isdir(self.int8_model_path):
+                os.makedirs(self.int8_model_path)
+        except OSError as e:
+            print("Failed to create {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+            sys.exit(-1)
+
+    def tearDown(self):
+        """Remove the directory holding the quantized model."""
+        # Imported locally so this method does not depend on the
+        # file-level import list.
+        import shutil
+        try:
+            if os.path.exists(self.int8_model_path):
+                shutil.rmtree(self.int8_model_path)
+        except OSError as e:
+            print("Failed to delete {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+
+    def cache_unzipping(self, target_folder, zip_path):
+        """
+        Extract the archive ``zip_path`` into ``target_folder``.
+
+        Nothing is done when ``target_folder`` already exists, so an
+        archive is only unpacked once per cache folder.
+        """
+        if not os.path.exists(target_folder):
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
+                                                          zip_path)
+            os.system(cmd)
+
+    def download_model(self, data_url, data_md5, folder_name):
+        """
+        Download an archive and unpack it below the cache folder.
+
+        Args:
+            data_url(str): URL of the archive.
+            data_md5(str): md5 value handed to ``download``.
+            folder_name(str): name of the folder inside the cache folder
+                that receives the extracted files.
+        Returns:
+            str: path of the folder the archive was extracted to.
+        """
+        download(data_url, self.download_path, data_md5)
+        file_name = data_url.split('/')[-1]
+        zip_path = os.path.join(self.cache_folder, file_name)
+        print('Data is downloaded at {0}'.format(zip_path))
+
+        data_cache_folder = os.path.join(self.cache_folder, folder_name)
+        self.cache_unzipping(data_cache_folder, zip_path)
+        return data_cache_folder
+
+    def _read_samples(self, data_path, place):
+        """
+        Yield ``(lod_tensor, label)`` pairs parsed from a binary data file.
+
+        Each record starts with a 4-byte integer whose low 16 bits hold
+        the label length and whose next 16 bits hold the sequence length.
+        It is followed by ``label_len`` int32 labels and by
+        ``seq_len * 8`` float32 feature values. Records whose label count
+        is not 1 or whose label exceeds 6350 are skipped.
+        """
+        with open(data_path, 'rb') as in_file:
+            while True:
+                plen = in_file.read(4)
+                if len(plen) != 4:
+                    break
+
+                alllen = struct.unpack('i', plen)[0]
+                label_len = alllen & 0xFFFF
+                seq_len = (alllen >> 16) & 0xFFFF
+
+                label = in_file.read(4 * label_len)
+                # Consume the feature payload before filtering: skipping a
+                # record without reading it would leave the file position
+                # inside that record and corrupt every following header.
+                feat = in_file.read(4 * seq_len * 8)
+
+                label = np.frombuffer(
+                    label, dtype=np.int32).reshape([len(label) // 4])
+                if label.shape[0] != 1 or label[0] > 6350:
+                    continue
+
+                feat = np.frombuffer(
+                    feat, dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
+                lod_feat = [feat.shape[0]]
+
+                minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
+                yield minputs, label
+
+    def get_batch_reader(self, data_path, place):
+        """
+        Return a reader whose items are ``[lod_tensor]`` lists; it is
+        used as the ``batch_generator`` of ``PostTrainingQuantization``.
+        """
+
+        def reader():
+            for minputs, _ in self._read_samples(data_path, place):
+                yield [minputs]
+
+        return reader
+
+    def get_simple_reader(self, data_path, place):
+        """
+        Return a reader whose items are ``(lod_tensor, label)`` pairs; it
+        is used for the accuracy evaluation in ``run_program``.
+        """
+
+        def reader():
+            for minputs, label in self._read_samples(data_path, place):
+                yield minputs, label
+
+        return reader
+
+    def run_program(self, model_path, data_path, infer_iterations):
+        """
+        Run inference on the samples of ``data_path`` with the model
+        stored in ``model_path``.
+
+        Args:
+            model_path(str): directory of the inference model.
+            data_path(str): path of the binary sample file.
+            infer_iterations(int): number of samples after which the
+                evaluation stops.
+        Returns:
+            tuple: ``(latency, acc)``, the average time of one
+            ``exe.run`` call in seconds and the fraction of samples whose
+            arg-max output equals the label.
+        """
+        print("test model path:" + model_path)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        [infer_program, feed_dict, fetch_targets] = \
+            fluid.io.load_inference_model(model_path, exe)
+
+        val_reader = self.get_simple_reader(data_path, place)
+
+        all_num = 0
+        right_num = 0
+        periods = []
+        for batch_id, (data, label) in enumerate(val_reader()):
+            t1 = time.time()
+            # Only the first fetched output is needed for the accuracy.
+            cls_out, _ = exe.run(infer_program,
+                                 feed={feed_dict[0]: data},
+                                 fetch_list=fetch_targets,
+                                 return_numpy=False)
+            t2 = time.time()
+            periods.append(t2 - t1)
+
+            cls_out = np.array(cls_out).reshape(-1)
+            out_cls_label = np.argmax(cls_out)
+
+            all_num += 1
+            if out_cls_label == label[0]:
+                right_num += 1
+
+            if (batch_id + 1) == infer_iterations:
+                break
+
+        latency = np.average(periods)
+        # float() keeps this a true division on every Python version.
+        acc = float(right_num) / all_num
+        return (latency, acc)
+
+    def generate_quantized_model(self,
+                                 model_path,
+                                 data_path,
+                                 algo="KL",
+                                 quantizable_op_type=None,
+                                 is_full_quantize=False,
+                                 is_use_cache_file=False,
+                                 is_optimize_model=False,
+                                 batch_size=10,
+                                 batch_nums=10):
+        """
+        Quantize the model in ``model_path`` and save the result to
+        ``self.int8_model_path``.
+
+        Args:
+            model_path(str): directory of the FP32 inference model.
+            data_path(str): path of the binary calibration sample file.
+            algo(str): calibration algorithm passed to
+                ``PostTrainingQuantization``.
+            quantizable_op_type(list|None): op types to quantize;
+                ``None`` selects ``["conv2d"]``.
+            is_full_quantize(bool): forwarded as ``is_full_quantize``.
+            is_use_cache_file(bool): forwarded as ``is_use_cache_file``.
+            is_optimize_model(bool): forwarded as ``optimize_model``.
+            batch_size(int): accepted for interface compatibility; it is
+                not used because the batch generator yields one sample
+                at a time.
+            batch_nums(int): number of calibration batches.
+        """
+        # A None default avoids sharing one mutable list between calls.
+        if quantizable_op_type is None:
+            quantizable_op_type = ["conv2d"]
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        batch_generator = self.get_batch_reader(data_path, place)
+
+        ptq = PostTrainingQuantization(
+            executor=exe,
+            model_dir=model_path,
+            batch_generator=batch_generator,
+            batch_nums=batch_nums,
+            algo=algo,
+            quantizable_op_type=quantizable_op_type,
+            is_full_quantize=is_full_quantize,
+            optimize_model=is_optimize_model,
+            is_use_cache_file=is_use_cache_file)
+        ptq.quantize()
+        ptq.save_quantized_model(self.int8_model_path)
+
+    def run_test(self, model_name, model_url, model_md5, data_name, data_url,
+                 data_md5, algo, quantizable_op_type, is_full_quantize,
+                 is_use_cache_file, is_optimize_model, diff_threshold,
+                 infer_iterations, quant_iterations):
+        """
+        Download model and data, evaluate the FP32 model, quantize it,
+        evaluate the quantized model and assert that
+        ``fp32_acc - int8_acc`` is less than ``diff_threshold``.
+
+        ``infer_iterations`` limits the number of evaluated samples and
+        ``quant_iterations`` is the number of calibration batches.
+        """
+        fp32_model_path = self.download_model(model_url, model_md5, model_name)
+        fp32_model_path = os.path.join(fp32_model_path, model_name)
+
+        data_path = self.download_model(data_url, data_md5, data_name)
+        data_path = os.path.join(data_path, data_name)
+
+        print("Start FP32 inference for {0} on {1} samples ...".format(
+            model_name, infer_iterations))
+        (fp32_latency, fp32_acc) = self.run_program(fp32_model_path, data_path,
+                                                    infer_iterations)
+
+        print("Start post training quantization for {0} on {1} samples ...".
+              format(model_name, quant_iterations))
+        # Keyword arguments make sure quant_iterations reaches batch_nums;
+        # passed positionally it would land in the unused batch_size slot.
+        self.generate_quantized_model(
+            fp32_model_path,
+            data_path,
+            algo=algo,
+            quantizable_op_type=quantizable_op_type,
+            is_full_quantize=is_full_quantize,
+            is_use_cache_file=is_use_cache_file,
+            is_optimize_model=is_optimize_model,
+            batch_nums=quant_iterations)
+
+        print("Start INT8 inference for {0} on {1} samples ...".format(
+            model_name, infer_iterations))
+        (int8_latency, int8_acc) = self.run_program(self.int8_model_path,
+                                                    data_path, infer_iterations)
+
+        print("---Post training quantization of {} method---".format(algo))
+        print("FP32 {0}: batch_size {1}, latency {2} s, acc {3}.".format(
+            model_name, 1, fp32_latency, fp32_acc))
+        print("INT8 {0}: batch_size {1}, latency {2} s, acc1 {3}.\n".format(
+            model_name, 1, int8_latency, int8_acc))
+        sys.stdout.flush()
+
+        delta_value = fp32_acc - int8_acc
+        self.assertLess(delta_value, diff_threshold)
+
+
+class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
+    """
+    KL post-training quantization test for the ``nlp_lstm_fp32_model``.
+
+    NOTE(review): the class name mentions Mnist although the model under
+    test is an LSTM; the name is kept so the test id stays stable.
+    """
+
+    def test_post_training_kl(self):
+        """Quantize the mul and lstm ops and check the accuracy drop."""
+        self.run_test(
+            model_name="nlp_lstm_fp32_model",
+            model_url="https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz",
+            model_md5="519b8eeac756e7b4b7bcb2868e880452",
+            data_name="quant_lstm_input_data",
+            data_url="https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz",
+            data_md5="add84c754e9b792fea1fbd728d134ab7",
+            algo="KL",
+            quantizable_op_type=["mul", "lstm"],
+            is_full_quantize=False,
+            is_use_cache_file=False,
+            is_optimize_model=False,
+            diff_threshold=0.01,
+            infer_iterations=100,
+            quant_iterations=10)
+
+
+# Start the unittest runner when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()