From 8b74fc4fa78eb13aec82255acde8a55514a012c4 Mon Sep 17 00:00:00 2001
From: juncaipeng <52520497+juncaipeng@users.noreply.github.com>
Date: Thu, 19 Dec 2019 14:10:52 +0800
Subject: [PATCH] Fix post training quantization (#21745)

* fix the memory-constrained bug in post training quantization, and support inputs with different shapes, test=develop
---
 .../post_training_quantization.py             | 98 ++++++++++++++-----
 ..._post_training_quantization_mobilenetv1.py | 18 ++--
 ...est_post_training_quantization_resnet50.py |  5 +-
 3 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 117de330cd..cc5e87b22e 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
+import os
+import re
 import logging
 import numpy as np
 from ....executor import global_scope
@@ -43,7 +45,9 @@ class PostTrainingQuantization(object):
                  scope=None,
                  algo="KL",
                  quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
-                 is_full_quantize=False):
+                 is_full_quantize=False,
+                 is_use_cache_file=False,
+                 cache_dir="./temp_post_training"):
         '''
         The class utilizes post training quantization method to quantize the
         fp32 model. It uses calibration data to calculate the scale factor of
@@ -78,9 +82,16 @@ class PostTrainingQuantization(object):
                 that will be quantized. Default is ["conv2d",
                 "depthwise_conv2d", "mul"].
             is_full_quantize(bool, optional): If set is_full_quantize as True,
-                apply quantization to all supported quantizable op type. If set
+                apply quantization to all supported quantizable op types. If set
                 is_full_quantize as False, only apply quantization to the op types
                 according to the input quantizable_op_type.
+            is_use_cache_file(bool, optional): If set is_use_cache_file as False,
+                all temp data will be saved in memory. If set is_use_cache_file
+                as True, temp data will be saved to disk. When the fp32 model is
+                complex or the amount of calibration data is large,
+                is_use_cache_file should be set as True. Default is False.
+            cache_dir(str, optional): When is_use_cache_file is True, set cache_dir
+                as the directory for saving temp data. Default is
+                ./temp_post_training.

         Returns:
             None
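For orientation before the implementation hunks: a minimal usage sketch of the two new flags. The executor and sample_generator argument names are assumptions (this diff does not show them), the import path is inferred from the file location above, and the model paths are illustrative.

```python
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

def sample_reader():
    # Hypothetical calibration reader: yields one batch of feed data per call.
    ...

exe = fluid.Executor(fluid.CPUPlace())
ptq = PostTrainingQuantization(
    executor=exe,                      # assumed argument name
    sample_generator=sample_reader,    # assumed argument name
    model_dir="./fp32_model",          # illustrative path to the fp32 model
    algo="KL",
    is_full_quantize=False,
    is_use_cache_file=True,            # new: spill sampled activations to disk
    cache_dir="./temp_post_training")  # new: where the .npy cache files live
ptq.quantize()
ptq.save_quantized_model("./int8_model")
```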
@@ -129,6 +140,10 @@ class PostTrainingQuantization(object):
         self._batch_nums = batch_nums
         self._scope = global_scope() if scope == None else scope
         self._algo = algo
+        self._is_use_cache_file = is_use_cache_file
+        self._cache_dir = cache_dir
+        if self._is_use_cache_file and not os.path.exists(self._cache_dir):
+            os.mkdir(self._cache_dir)

         supported_quantizable_op_type = \
             QuantizationTransformPass._supported_quantizable_op_type + \
@@ -150,8 +165,8 @@ class PostTrainingQuantization(object):
         self._op_real_in_out_name = _op_real_in_out_name

         self._bit_length = 8
-        self._quantized_weight_var_name = []
-        self._quantized_act_var_name = []
+        self._quantized_weight_var_name = set()
+        self._quantized_act_var_name = set()
         self._sampling_data = {}
         self._quantized_var_scale_factor = {}
@@ -174,7 +189,8 @@ class PostTrainingQuantization(object):
                                   feed=data,
                                   fetch_list=self._fetch_list,
                                   return_numpy=False)
-            self._sample_data()
-            _logger.info("run batch: " + str(batch_id))
+            self._sample_data(batch_id)
+            if batch_id % 5 == 0:
+                _logger.info("run batch: " + str(batch_id))
             batch_id += 1
@@ -238,10 +254,9 @@ class PostTrainingQuantization(object):
             op_type = op.type
             if op_type in self._quantizable_op_type:
                 if op_type in ("conv2d", "depthwise_conv2d"):
-                    self._quantized_act_var_name.append(op.input("Input")[0])
-                    self._quantized_weight_var_name.append(
-                        op.input("Filter")[0])
-                    self._quantized_act_var_name.append(op.output("Output")[0])
+                    self._quantized_act_var_name.add(op.input("Input")[0])
+                    self._quantized_weight_var_name.add(op.input("Filter")[0])
+                    self._quantized_act_var_name.add(op.output("Output")[0])
                 elif op_type == "mul":
                     if self._is_input_all_not_persistable(
                             op, persistable_var_names):
@@ -249,9 +264,9 @@ class PostTrainingQuantization(object):
                         _logger.warning("Skip quantizing a mul op whose two "
                                         "input variables are not persistable")
                     else:
-                        self._quantized_act_var_name.append(op.input("X")[0])
-                        self._quantized_weight_var_name.append(op.input("Y")[0])
-                        self._quantized_act_var_name.append(op.output("Out")[0])
+                        self._quantized_act_var_name.add(op.input("X")[0])
+                        self._quantized_weight_var_name.add(op.input("Y")[0])
+                        self._quantized_act_var_name.add(op.output("Out")[0])
                 else:
                     # process other quantizable op types; all inputs must be non-persistable
                     if self._is_input_all_not_persistable(
@@ -260,10 +275,10 @@ class PostTrainingQuantization(object):
                         op_type]
                     for input_name in input_output_name_list[0]:
                         for var_name in op.input(input_name):
-                            self._quantized_act_var_name.append(var_name)
+                            self._quantized_act_var_name.add(var_name)
                     for output_name in input_output_name_list[1]:
                         for var_name in op.output(output_name):
-                            self._quantized_act_var_name.append(var_name)
+                            self._quantized_act_var_name.add(var_name)

         # set activation variables to be persistable, so we can obtain
         # the tensor data in _sample_data
@@ -271,7 +286,7 @@ class PostTrainingQuantization(object):
             if var.name in self._quantized_act_var_name:
                 var.persistable = True

-    def _sample_data(self):
+    def _sample_data(self, iter):
         '''
         Sample the tensor data of quantized variables,
         applied in every iteration.
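An aside on the hunks above: activations are flipped to persistable because only persistable variables survive in the scope after Executor.run, which is what lets _sample_data read their tensors back each batch. A self-contained sketch of that lookup, presumably close to what _load_var_value does; the variable name is hypothetical.

```python
import numpy as np
import paddle.fluid as fluid

scope = fluid.global_scope()
# find_var succeeds after Executor.run only if the variable is persistable.
var_tensor = scope.find_var("conv2d_0.tmp_0").get_tensor()  # hypothetical name
arr = np.array(var_tensor)     # copy the LoDTensor into numpy
abs_max = np.max(np.abs(arr))  # the kind of statistic sampled per batch
```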
@@ -281,11 +296,20 @@ class PostTrainingQuantization(object):
                 var_tensor = self._load_var_value(var_name)
                 self._sampling_data[var_name] = var_tensor

-        for var_name in self._quantized_act_var_name:
-            if var_name not in self._sampling_data:
-                self._sampling_data[var_name] = []
-            var_tensor = self._load_var_value(var_name)
-            self._sampling_data[var_name].append(var_tensor)
+        if self._is_use_cache_file:
+            for var_name in self._quantized_act_var_name:
+                var_tensor = self._load_var_value(var_name)
+                var_tensor = var_tensor.ravel()
+                save_path = os.path.join(self._cache_dir,
+                                         var_name + "_" + str(iter) + ".npy")
+                np.save(save_path, var_tensor)
+        else:
+            for var_name in self._quantized_act_var_name:
+                if var_name not in self._sampling_data:
+                    self._sampling_data[var_name] = []
+                var_tensor = self._load_var_value(var_name)
+                var_tensor = var_tensor.ravel()
+                self._sampling_data[var_name].append(var_tensor)

     def _calculate_scale_factor(self):
         '''
@@ -302,13 +326,33 @@ class PostTrainingQuantization(object):
                     var_name] = scale_factor_per_channel

         # calculate the scale factor of activations (KL or abs_max)
-        for var_name in self._quantized_act_var_name:
-            if self._algo == "KL":
-                self._quantized_var_scale_factor[var_name] = \
-                    self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name]))
-            else:
-                self._quantized_var_scale_factor[var_name] = \
-                    np.max(np.abs(self._sampling_data[var_name]))
+        if self._is_use_cache_file:
+            for var_name in self._quantized_act_var_name:
+                sampling_data = []
+                filenames = [f for f in os.listdir(self._cache_dir)
+                             if re.match(re.escape(var_name) + r'_[0-9]+\.npy$', f)]
+                for filename in filenames:
+                    file_path = os.path.join(self._cache_dir, filename)
+                    sampling_data.append(np.load(file_path))
+                    os.remove(file_path)
+                sampling_data = np.concatenate(sampling_data)
+
+                if self._algo == "KL":
+                    self._quantized_var_scale_factor[var_name] = \
+                        self._get_kl_scaling_factor(np.abs(sampling_data))
+                else:
+                    self._quantized_var_scale_factor[var_name] = \
+                        np.max(np.abs(sampling_data))
+        else:
+            for var_name in self._quantized_act_var_name:
+                self._sampling_data[var_name] = np.concatenate(
+                    self._sampling_data[var_name])
+                if self._algo == "KL":
+                    self._quantized_var_scale_factor[var_name] = \
+                        self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name]))
+                else:
+                    self._quantized_var_scale_factor[var_name] = \
+                        np.max(np.abs(self._sampling_data[var_name]))

     def _update_program(self):
         '''
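To make the disk-cache path above concrete: a standalone round trip mirroring _sample_data and _calculate_scale_factor. The ravel() call is what allows batches with different shapes to be concatenated later, which is the "support the input be different" part of this fix; the directory, variable name, and shapes are illustrative.

```python
import os
import re
import numpy as np

cache_dir = "./temp_post_training"
if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)
var_name = "conv2d_0.tmp_0"  # hypothetical activation variable

# Sampling phase: one .npy file per batch, flattened with ravel() so batches
# of different shapes can be concatenated afterwards.
for batch_id, shape in enumerate([(1, 8, 4, 4), (2, 8, 4, 4)]):
    np.save(os.path.join(cache_dir, "%s_%d.npy" % (var_name, batch_id)),
            np.random.randn(*shape).ravel())

# Scale phase: match "<var_name>_<iter>.npy", load, delete, concatenate, reduce.
filenames = [f for f in os.listdir(cache_dir)
             if re.match(re.escape(var_name) + r'_[0-9]+\.npy$', f)]
sampling_data = []
for filename in filenames:
    file_path = os.path.join(cache_dir, filename)
    sampling_data.append(np.load(file_path))
    os.remove(file_path)
scale = np.max(np.abs(np.concatenate(sampling_data)))
print("abs_max scale for %s: %f" % (var_name, scale))
```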
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index 5180723da4..0cd804cc51 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -237,7 +237,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
     def generate_quantized_model(self,
                                  model_path,
                                  algo="KL",
-                                 is_full_quantize=False):
+                                 is_full_quantize=False,
+                                 is_use_cache_file=False):
         try:
             os.system("mkdir " + self.int8_model)
         except Exception as e:
@@ -259,11 +260,13 @@ class TestPostTrainingQuantization(unittest.TestCase):
             model_dir=model_path,
             algo=algo,
             quantizable_op_type=quantizable_op_type,
-            is_full_quantize=is_full_quantize)
+            is_full_quantize=is_full_quantize,
+            is_use_cache_file=is_use_cache_file)
         ptq.quantize()
         ptq.save_quantized_model(self.int8_model)

-    def run_test(self, model, algo, data_urls, data_md5s):
+    def run_test(self, model, algo, data_urls, data_md5s, is_full_quantize,
+                 is_use_cache_file):
         infer_iterations = self.infer_iterations
         batch_size = self.batch_size
         sample_iterations = self.sample_iterations
@@ -277,8 +280,8 @@ class TestPostTrainingQuantization(unittest.TestCase):

         print("Start INT8 post training quantization for {0} on {1} images ...".
               format(model, sample_iterations * batch_size))
-        self.generate_quantized_model(
-            model_cache_folder + "/model", algo=algo, is_full_quantize=True)
+        self.generate_quantized_model(model_cache_folder + "/model", algo,
+                                      is_full_quantize, is_use_cache_file)

         print("Start INT8 inference for {0} on {1} images ...".format(
             model, infer_iterations * batch_size))
@@ -305,7 +308,10 @@ class TestPostTrainingForMobilenetv1(TestPostTrainingQuantization):
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
         data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
-        self.run_test(model, algo, data_urls, data_md5s)
+        is_full_quantize = True
+        is_use_cache_file = False
+        self.run_test(model, algo, data_urls, data_md5s, is_full_quantize,
+                      is_use_cache_file)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
index e93d68e857..815f2e4332 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
@@ -25,7 +25,10 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
             'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
         data_md5s = ['4a5194524823d9b76da6e738e1367881']
-        self.run_test(model, algo, data_urls, data_md5s)
+        is_full_quantize = False
+        is_use_cache_file = True
+        self.run_test(model, algo, data_urls, data_md5s, is_full_quantize,
+                      is_use_cache_file)


 if __name__ == '__main__':