diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index ddbd99e16cebdfc839a8e96e44d4f96f02e70c55..00aca7744e4f6a9178b932bcd22a8899ea0f6112 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -29,6 +29,7 @@ from .quantization_pass import _out_scale_op_list from .quantization_pass import _get_op_input_var_names from .quantization_pass import _get_op_output_var_names from .quantization_pass import _get_output_name_index +from .quantization_pass import _get_input_name_index from .quantization_pass import _channelwise_quant_axis1_ops __all__ = ['PostTrainingQuantization', 'WeightQuantization'] @@ -253,9 +254,11 @@ class PostTrainingQuantization(object): ] self._support_weight_quantize_type = ['abs_max', 'channel_wise_abs_max'] self._support_algo_type = ['KL', 'abs_max', 'min_max'] + self._dynamic_quantize_op_type = ['lstm'] self._support_quantize_op_type = \ list(set(QuantizationTransformPass._supported_quantizable_op_type + - AddQuantDequantPass._supported_quantizable_op_type)) + AddQuantDequantPass._supported_quantizable_op_type + + self._dynamic_quantize_op_type)) # Check inputs assert executor is not None, "The executor cannot be None." 
def _collect_dynamic_quantize_op_threshold(self, target_ops_type):
    """
    Collect and save the weight threshold for dynamic quantize ops,
    such as lstm and gru.

    This is a method of ``PostTrainingQuantization``. Every op of
    ``self._program`` whose type is listed in ``target_ops_type`` is
    visited. For each persistable input of such an op, the maximum
    absolute value of the variable data loaded from ``self._scope`` is
    stored on the op as the attribute ``<argname><index>_threshold``,
    together with the attributes ``quantization_type``
    (``"post_" + self._algo``, lower-cased) and ``bit_length``
    (``self._weight_bits``).

    Args:
        target_ops_type(list): the op type of target ops
    Returns:
        None
    """
    target_ops = [
        op
        for block_id in range(self._program.num_blocks)
        for op in self._program.block(block_id).ops
        if op.type in target_ops_type
    ]

    quantization_type = str("post_" + self._algo).lower()
    # Build the lookup once: membership is tested for every input of every
    # target op in the loop below.
    persistable_var_names = set(_all_persistable_var_names(self._program))
    for op in target_ops:
        for var_name in _get_op_input_var_names(op):
            if var_name not in persistable_var_names:
                continue

            name_index = _get_input_name_index(op, var_name)
            if name_index is None:
                # _get_input_name_index returns None when it cannot locate
                # the variable; without an (argname, index) pair there is
                # no attribute name to write, so skip the variable instead
                # of failing on the tuple unpacking.
                continue
            argname, index = name_index

            var_data = _load_variable_data(self._scope, var_name)
            threshold = float(np.max(np.abs(var_data)))
            op._set_attr(argname + str(index) + "_threshold", threshold)
            op._set_attr("quantization_type", quantization_type)
            op._set_attr("bit_length", self._weight_bits)
def _get_input_name_index(op, input_var_name):
    """
    Get the input name and index of the var_name in the op.

    Only the input arguments listed for the op type in
    ``_op_real_in_out_name`` are searched.

    Args:
        op(IrNode|Operator): the op whose inputs are searched.
        input_var_name(str): the name of the input variable to locate.
    Returns:
        tuple|None: ``(argname, index)`` where ``argname`` is the input
        argument holding the variable and ``index`` is its position in
        ``op.input(argname)``. None is returned when the op type has no
        entry in ``_op_real_in_out_name`` or the variable is not found.
        If the variable occurs several times, the last occurrence wins.
    """
    assert isinstance(op, (IrNode, Operator)), \
        "The input op should be IrNode or Operator."
    op_name = op.name() if isinstance(op, IrNode) \
        else op.type
    if op_name not in _op_real_in_out_name:
        # An op type without a table entry has no known inputs to search;
        # report "not found" instead of raising KeyError.
        return None

    res = None
    for argname in _op_real_in_out_name[op_name][0]:
        for index, name in enumerate(op.input(argname)):
            if name == input_var_name:
                res = (argname, index)
    return res
test_weight_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_quantize_transpiler_v2) endif() @@ -300,8 +301,9 @@ endforeach() # setting timeout value for old unittests if(NOT WIN32) + set_tests_properties(test_post_training_quantization_lstm_model PROPERTIES TIMEOUT 120) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8a28ee7983e6a7e363ba98cdabd34b0fd7f1fb17 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -0,0 +1,256 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
import unittest
import os
import shutil
import tarfile
import time
import sys
import random
import math
import functools
import contextlib
import struct
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.dataset.common import download
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()

random.seed(0)
np.random.seed(0)


class TestPostTrainingQuantization(unittest.TestCase):
    """
    Base test case for post training quantization of an LSTM model.

    It downloads the FP32 model and the input data, measures the FP32
    accuracy, quantizes the model with ``PostTrainingQuantization``,
    measures the accuracy of the quantized model and compares both.
    """

    def setUp(self):
        """Create a timestamped output directory for the quantized model."""
        self.download_path = 'int8/download'
        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                               self.download_path)
        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
        self.int8_model_path = os.path.join(os.getcwd(),
                                            "post_training_" + self.timestamp)
        # A failure raises OSError and fails this test only; the previous
        # os.system call never raised, so its error handling was dead code.
        if not os.path.isdir(self.int8_model_path):
            os.makedirs(self.int8_model_path)

    def tearDown(self):
        """Remove the quantized model directory (best effort)."""
        shutil.rmtree(self.int8_model_path, ignore_errors=True)

    def cache_unzipping(self, target_folder, zip_path):
        """
        Extract the tar archive ``zip_path`` into ``target_folder``.

        Nothing is done when ``target_folder`` already exists, so an
        earlier extraction is reused.
        """
        if os.path.exists(target_folder):
            return

        os.makedirs(target_folder)
        try:
            # The archive is the file fetched by download_model, which
            # passes the expected md5 to paddle's download helper.
            with tarfile.open(zip_path) as archive:
                archive.extractall(target_folder)
        except Exception:
            # Do not leave a partially filled folder behind: its existence
            # would make the next run skip the extraction.
            shutil.rmtree(target_folder, ignore_errors=True)
            raise

    def download_model(self, data_url, data_md5, folder_name):
        """
        Download an archive and unpack it below the cache folder.

        Returns:
            str: the folder the archive was extracted to.
        """
        download(data_url, self.download_path, data_md5)
        file_name = data_url.split('/')[-1]
        zip_path = os.path.join(self.cache_folder, file_name)
        print('Data is downloaded at {0}'.format(zip_path))

        data_cache_folder = os.path.join(self.cache_folder, folder_name)
        self.cache_unzipping(data_cache_folder, zip_path)
        return data_cache_folder

    def _read_samples(self, data_path, place):
        """
        Yield ``(lod_tensor, label)`` pairs parsed from the binary data file.

        Each record starts with a 4-byte integer header: the low 16 bits
        give the number of int32 label values and the next 16 bits give
        the sequence length. The header is followed by the label values
        and by ``seq_len * 8`` float32 feature values. Records whose label
        is not a single value, or is greater than 6350, are skipped.
        """
        with open(data_path, 'rb') as in_file:
            while True:
                plen = in_file.read(4)
                if len(plen) != 4:
                    break

                alllen = struct.unpack('i', plen)[0]
                label_len = alllen & 0xFFFF
                seq_len = (alllen >> 16) & 0xFFFF

                label = np.frombuffer(
                    in_file.read(4 * label_len), dtype=np.int32)
                # Consume the feature payload before filtering on the
                # label; otherwise a skipped record leaves its features in
                # the stream and they are parsed as the next header.
                # NOTE(review): assumes every record carries the payload
                # announced by its header -- confirm against the data set.
                feat = np.frombuffer(
                    in_file.read(4 * seq_len * 8), dtype=np.float32)
                if label.shape[0] != 1 or label[0] > 6350:
                    continue

                feat = feat.reshape([feat.size // 8, 8])
                lod_feat = [feat.shape[0]]
                minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
                yield minputs, label

    def get_batch_reader(self, data_path, place):
        """Return a reader yielding ``[lod_tensor]`` for calibration."""

        def reader():
            for minputs, _ in self._read_samples(data_path, place):
                yield [minputs]

        return reader

    def get_simple_reader(self, data_path, place):
        """Return a reader yielding ``(lod_tensor, label)`` for evaluation."""

        def reader():
            for sample in self._read_samples(data_path, place):
                yield sample

        return reader

    def run_program(self, model_path, data_path, infer_iterations):
        """
        Run inference on at most ``infer_iterations`` samples.

        Returns:
            tuple: ``(latency, acc)`` -- the average time of one
            ``exe.run`` call in seconds and the fraction of samples whose
            predicted class equals the label.
        """
        print("test model path:" + model_path)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        [infer_program, feed_dict, fetch_targets] = \
            fluid.io.load_inference_model(model_path, exe)

        val_reader = self.get_simple_reader(data_path, place)

        all_num = 0
        right_num = 0
        periods = []
        for batch_id, (data, label) in enumerate(val_reader()):
            t1 = time.time()
            cls_out, ctc_out = exe.run(infer_program,
                                       feed={feed_dict[0]: data},
                                       fetch_list=fetch_targets,
                                       return_numpy=False)
            t2 = time.time()
            periods.append(t2 - t1)

            cls_out = np.array(cls_out).reshape(-1)
            out_cls_label = np.argmax(cls_out)

            all_num += 1
            if out_cls_label == label[0]:
                right_num += 1

            if (batch_id + 1) == infer_iterations:
                break

        # Fail with a clear message instead of a ZeroDivisionError when
        # the reader produced no usable sample.
        self.assertGreater(all_num, 0,
                           "No sample was read from {}".format(data_path))
        latency = np.average(periods)
        acc = float(right_num) / all_num
        return (latency, acc)

    def generate_quantized_model(self,
                                 model_path,
                                 data_path,
                                 algo="KL",
                                 quantizable_op_type=None,
                                 is_full_quantize=False,
                                 is_use_cache_file=False,
                                 is_optimize_model=False,
                                 batch_size=10,
                                 batch_nums=10):
        """
        Quantize the model in ``model_path`` and save it to
        ``self.int8_model_path``.

        ``quantizable_op_type`` defaults to ``["conv2d"]`` when None.
        ``batch_size`` is accepted for interface compatibility but is not
        used: the batch reader yields one sample at a time.
        """
        if quantizable_op_type is None:
            quantizable_op_type = ["conv2d"]

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        batch_generator = self.get_batch_reader(data_path, place)

        ptq = PostTrainingQuantization(
            executor=exe,
            model_dir=model_path,
            batch_generator=batch_generator,
            batch_nums=batch_nums,
            algo=algo,
            quantizable_op_type=quantizable_op_type,
            is_full_quantize=is_full_quantize,
            optimize_model=is_optimize_model,
            is_use_cache_file=is_use_cache_file)
        ptq.quantize()
        ptq.save_quantized_model(self.int8_model_path)

    def run_test(self, model_name, model_url, model_md5, data_name, data_url,
                 data_md5, algo, quantizable_op_type, is_full_quantize,
                 is_use_cache_file, is_optimize_model, diff_threshold,
                 infer_iterations, quant_iterations):
        """
        Check that quantization lowers the accuracy by less than
        ``diff_threshold``.

        The FP32 model and the quantized model are both evaluated on
        ``infer_iterations`` samples; ``quant_iterations`` batches are
        used to calibrate the quantization.
        """
        fp32_model_path = self.download_model(model_url, model_md5, model_name)
        fp32_model_path = os.path.join(fp32_model_path, model_name)

        data_path = self.download_model(data_url, data_md5, data_name)
        data_path = os.path.join(data_path, data_name)

        print("Start FP32 inference for {0} on {1} samples ...".format(
            model_name, infer_iterations))
        (fp32_latency, fp32_acc) = self.run_program(fp32_model_path, data_path,
                                                    infer_iterations)

        print("Start post training quantization for {0} on {1} samples ...".
              format(model_name, quant_iterations))
        # quant_iterations is the number of calibration batches. It is
        # passed by keyword because the next positional parameter of
        # generate_quantized_model is the unused batch_size.
        self.generate_quantized_model(
            fp32_model_path,
            data_path,
            algo,
            quantizable_op_type,
            is_full_quantize,
            is_use_cache_file,
            is_optimize_model,
            batch_nums=quant_iterations)

        print("Start INT8 inference for {0} on {1} samples ...".format(
            model_name, infer_iterations))
        (int8_latency, int8_acc) = self.run_program(self.int8_model_path,
                                                    data_path, infer_iterations)

        print("---Post training quantization of {} method---".format(algo))
        print("FP32 {0}: batch_size {1}, latency {2} s, acc {3}.".format(
            model_name, 1, fp32_latency, fp32_acc))
        print("INT8 {0}: batch_size {1}, latency {2} s, acc1 {3}.\n".format(
            model_name, 1, int8_latency, int8_acc))
        sys.stdout.flush()

        delta_value = fp32_acc - int8_acc
        self.assertLess(delta_value, diff_threshold)


class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
    """KL post training quantization of the mul and lstm ops of an LSTM model."""

    def test_post_training_kl(self):
        model_name = "nlp_lstm_fp32_model"
        model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
        model_md5 = "519b8eeac756e7b4b7bcb2868e880452"
        data_name = "quant_lstm_input_data"
        data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz"
        data_md5 = "add84c754e9b792fea1fbd728d134ab7"
        algo = "KL"
        quantizable_op_type = ["mul", "lstm"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = False
        diff_threshold = 0.01
        infer_iterations = 100
        quant_iterations = 10
        self.run_test(model_name, model_url, model_md5, data_name, data_url,
                      data_md5, algo, quantizable_op_type, is_full_quantize,
                      is_use_cache_file, is_optimize_model, diff_threshold,
                      infer_iterations, quant_iterations)


if __name__ == '__main__':
    unittest.main()