From ad2813b1251efb759f5061fc23fe520b13429dc6 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Tue, 11 Feb 2020 18:48:10 +0800 Subject: [PATCH] [cherry-pick] Add weight quantization in post_training_quanzitaion (#22445) (#22493) * Add weight quantization in post_training_quanzitaion (#22445) * [cherry-pick]Support int16 for Tensor (#22423) * add int16 support, test=develop, test=release/1.7 Co-authored-by: Leo Chen --- paddle/fluid/operators/save_op.cc | 1 + paddle/fluid/pybind/tensor_py.h | 18 +- .../post_training_quantization.py | 189 +++++++++++++++--- .../fluid/contrib/slim/tests/CMakeLists.txt | 1 + .../test_weight_quantization_mobilenetv1.py | 91 +++++++++ .../fluid/tests/unittests/test_tensor.py | 50 +++-- 6 files changed, 306 insertions(+), 44 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 09b171fe90..8a41d79433 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -90,4 +90,5 @@ REGISTER_OP_CPU_KERNEL( ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index cc44ad9a2d..9e5dc63851 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -106,9 +106,10 @@ DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); DECLARE_VALID_DTYPE_TO_PY_ARRAY(int8_t); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(uint8_t); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(int16_t); DECLARE_VALID_DTYPE_TO_PY_ARRAY(int); DECLARE_VALID_DTYPE_TO_PY_ARRAY(int64_t); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(uint8_t); inline std::string TensorDTypeToPyDTypeStr( framework::proto::VarType::Type type) { @@ -218,13 +219,16 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); + } else if (py::isinstance>(array)) { + SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { - // TODO(cql): temporary keeping uint16, should be depracated later + // TODO(cql): temporary keeping uint16, which is used for casting float16 + // before. It should be depracated later. SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { @@ -234,7 +238,7 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, "Incompatible data or style type: tensor.set() supports bool, float16, " "float32, " "float64, " - "int8, int32, int64 and uint8, uint16, but got %s!", + "int8, int16, int32, int64 and uint8, uint16, but got %s!", array.dtype()); } } @@ -435,16 +439,18 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::INT8: + return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::INT16: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::INT32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::INT64: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::BOOL: return _sliceAndConcat(self, obj, dim); - case framework::proto::VarType::INT16: - return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::UINT8: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat(self, obj, dim); default: PADDLE_THROW("Not support type %d", src_type); } diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index d0d69ae91a..ae2298e10a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -16,10 +16,10 @@ import os import re import logging import numpy as np -from ....executor import global_scope from .... import io from .... import core from .... import framework +from ....executor import global_scope, Executor from ....framework import IrGraph from ....log_helper import get_logger from .quantization_pass import QuantizationTransformPass @@ -27,12 +27,31 @@ from .quantization_pass import QuantizationFreezePass from .quantization_pass import AddQuantDequantPass from .quantization_pass import _op_real_in_out_name -__all__ = ['PostTrainingQuantization'] +__all__ = ['PostTrainingQuantization', 'WeightQuantization'] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +def _load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + return np.array(scope.find_var(var_name).get_tensor()) + + +def _set_variable_data(scope, place, var_name, np_value): + ''' + Set the value of var node by name, if the node exits, + ''' + assert isinstance(np_value, np.ndarray), \ + 'The type of value should be numpy array.' + var_node = scope.find_var(var_name) + if var_node != None: + tensor = var_node.get_tensor() + tensor.set(np_value, place) + + class PostTrainingQuantization(object): def __init__(self, executor, @@ -297,12 +316,12 @@ class PostTrainingQuantization(object): ''' for var_name in self._quantized_weight_var_name: if var_name not in self._sampling_data: - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) self._sampling_data[var_name] = var_tensor if self._is_use_cache_file: for var_name in self._quantized_act_var_name: - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.ravel() save_path = os.path.join(self._cache_dir, var_name + "_" + str(iter) + ".npy") @@ -311,7 +330,7 @@ class PostTrainingQuantization(object): for var_name in self._quantized_act_var_name: if var_name not in self._sampling_data: self._sampling_data[var_name] = [] - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.ravel() self._sampling_data[var_name].append(var_tensor) @@ -397,11 +416,17 @@ class PostTrainingQuantization(object): # save scale factor to scale var node for key, val in self._quantized_var_scale_factor.items(): - self._set_var_node_value( - key + ".scale", np.array( + _set_variable_data( + self._scope, + self._place, + key + ".scale", + np.array( [val], dtype=np.float32)) - self._set_var_node_value( - key + ".quant_dequant.scale", np.array( + _set_variable_data( + self._scope, + self._place, + key + ".quant_dequant.scale", + np.array( [val], dtype=np.float32)) # apply QuantizationFreezePass, and obtain the final quant model @@ -430,23 +455,6 @@ class PostTrainingQuantization(object): self._quantized_var_scale_factor[ output_var_name]) - def _load_var_value(self, var_name): - ''' - Load variable value from scope - ''' - return np.array(self._scope.find_var(var_name).get_tensor()) - - def _set_var_node_value(self, var_node_name, np_value): - ''' - Set the value of var node by name, if the node exits, - ''' - assert isinstance(np_value, np.ndarray), \ - 'The type of value should be numpy array.' - var_node = self._scope.find_var(var_node_name) - if var_node != None: - tensor = var_node.get_tensor() - tensor.set(np_value, self._place) - def _is_input_all_not_persistable(self, op, persistable_var_names): ''' Analyze the real inputs of the op are all not persistable. @@ -566,3 +574,132 @@ class PostTrainingQuantization(object): tmp_sum1 += p_idx * (math.log(Q_sum * p_idx)) tmp_sum2 += p_idx * (math.log(P_sum * q_idx)) return (tmp_sum1 - tmp_sum2) / P_sum + + +class WeightQuantization(object): + _supported_quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul'] + + def __init__(self, model_dir, model_filename=None, params_filename=None): + ''' + This class quantizes the weight of some ops to reduce the size of model + or improve the perforemace. + + Args: + model_dir(str): The path of the fp32 model that will be quantized, + and the model and params files are under the path. + model_filename(str, optional): The name of file to load the inference + program. If it is None, the default filename '__model__' will + be used. Default is 'None'. + params_filename(str, optional): The name of file to load all parameters. + When all parameters were saved in a single binary file, set it + as the real filename. If parameters were saved in separate files, + set it as 'None'. Default is 'None'. + ''' + self._model_dir = model_dir + self._model_filename = model_filename + self._params_filename = params_filename + + def quantize_weight_to_int(self, + save_model_dir, + save_model_filename=None, + save_params_filename=None, + quantizable_op_type=["conv2d", "mul"], + quantize_weight_bits=8, + threshold_rate=0.0): + ''' + In order to reduce the size of model, this api quantizes the weight + of some ops from float32 to int8/16. In the inference stage, the + quantized weight will be dequantized to float32 again. + + Args: + save_model_dir(str): The path to save the quantized model. + save_model_filename(str, optional): The name of file to + save the inference program. If it is None, the default + filename '__model__' will be used. Default is 'None'. + save_params_filename(str, optional): The name of file to + save all parameters. If it is None, parameters were + saved in separate files. If it is not None, all + parameters were saved in a single binary file. + quantizable_op_type(list[str], optional): The list of ops + that will be quantized, and the quantized ops should be + contained in ["conv2d", "depthwise_conv2d", "mul"]. + Default is ["conv2d","mul"]. + quantize_weight_bits(int, optional): The bits for the quantized + weight, and it should be 8 or 16. Default is 8. + threshold_rate(float, optional): This api uses abs_max methd to + quantize the weight from float32 to int8/16, and the abs max + value is important for quantization diff. When the abs_max + value is far away from the center of the numerical distribution, + we can set threshold_rate between 1e-6 and 1e-8, so the abs max + value will be optimized. Default is 0.0. + ''' + for op_type in quantizable_op_type: + assert op_type in self._supported_quantizable_op_type, \ + "input error:" + op_type + \ + " is not supported for weight quantization." + assert quantize_weight_bits in [8, 16], \ + "input error: quantize_weight_bits should be 8 or 16." + quantize_range = (1 << (quantize_weight_bits - 1)) - 1 + save_weight_dtype = np.int8 if quantize_weight_bits == 8 else np.int16 + + place = core.CPUPlace() + exe = Executor(place) + scope = global_scope() + [program, feed_list, fetch_list] = \ + io.load_inference_model(dirname=self._model_dir, + executor=exe, + model_filename=self._model_filename, + params_filename=self._params_filename) + + persistable_var_names = [] + for var in program.list_vars(): + if var.persistable: + persistable_var_names.append(var.name) + for op in program.global_block().ops: + if op.type in quantizable_op_type: + for var_name in op.input_arg_names: + if var_name in persistable_var_names: + var_tensor_data = _load_variable_data(scope, var_name) + if abs(threshold_rate) < 1e-10: + threshold_value = np.max(np.abs(var_tensor_data)) + else: + threshold_value = self._calculate_threshold(\ + var_tensor_data, threshold_rate) + var_tensor_data[var_tensor_data > + threshold_value] = threshold_value + var_tensor_data[var_tensor_data < + -threshold_value] = -threshold_value + scale = threshold_value / quantize_range + quantized_var_tensor_data = \ + np.around(var_tensor_data / scale) + quantized_var_tensor_data = \ + quantized_var_tensor_data.astype(save_weight_dtype) + _set_variable_data(scope, place, var_name, + quantized_var_tensor_data) + op._set_attr(var_name + "_quant_scale", [scale]) + op._set_attr('quantize_weight_bits', + quantize_weight_bits) + + io.save_inference_model( + dirname=save_model_dir, + feeded_var_names=feed_list, + target_vars=fetch_list, + executor=exe, + main_program=program, + model_filename=save_model_filename, + params_filename=save_params_filename) + + def _calculate_threshold(self, input, threshold_rate, histogram_bins=5000): + input_abs = np.abs(input) + hist, hist_edeges = np.histogram( + input_abs, bins=histogram_bins, range=(0, np.max(input_abs))) + hist = hist / float(sum(hist)) + hist_sum = 0 + hist_index = 0 + for i in range(len(hist)): + hist_sum += hist[i] + if hist_sum >= 1.0 - threshold_rate: + hist_index = i + 1 + break + bin_width = hist_edeges[1] - hist_edeges[0] + return hist_index * bin_width diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 2483917092..f1cc767097 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -58,6 +58,7 @@ if(WIN32) list(REMOVE_ITEM TEST_OPS test_light_nas) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50) + list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1) endif() # int8 image classification python api test diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py new file mode 100644 index 0000000000..c6380adf6b --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import time +from paddle.dataset.common import download, DATA_HOME +from paddle.fluid.contrib.slim.quantization import WeightQuantization + + +class TestWeightQuantization(unittest.TestCase): + def setUp(self): + self.weight_quantization_dir = 'weight_quantization' + self.cache_folder = os.path.join(DATA_HOME, + self.weight_quantization_dir) + + def download_model(self, model_name, data_url, data_md5): + download(data_url, self.weight_quantization_dir, data_md5) + file_name = data_url.split('/')[-1] + file_path = os.path.join(self.cache_folder, file_name) + print(model_name + ' is downloaded at ' + file_path) + + unziped_path = os.path.join(self.cache_folder, model_name) + self.cache_unzipping(unziped_path, file_path) + print(model_name + ' is unziped at ' + unziped_path) + return unziped_path + + def cache_unzipping(self, target_folder, zip_path): + if not os.path.exists(target_folder): + cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, + zip_path) + os.system(cmd) + + def run_test(self, model_name, model_data_url, model_data_md5, + quantize_weight_bits, quantizable_op_type, threshold_rate): + + model_dir = self.download_model(model_name, model_data_url, + model_data_md5) + + timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + save_model_dir = os.path.join( + os.getcwd(), + model_name + "_wq_" + str(quantize_weight_bits) + "_" + timestamp) + weight_quant = WeightQuantization(model_dir=model_dir + "/model") + weight_quant.quantize_weight_to_int( + save_model_dir=save_model_dir, + quantize_weight_bits=quantize_weight_bits, + quantizable_op_type=quantizable_op_type, + threshold_rate=threshold_rate) + print("finish weight quantization for " + model_name + "\n") + + try: + os.system("rm -rf {}".format(save_model_dir)) + except Exception as e: + print("Failed to delete {} due to {}".format(save_model_dir, str( + e))) + + +class TestWeightQuantizationMobilenetv1(TestWeightQuantization): + model_name = "mobilenetv1" + model_data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz" + model_data_md5 = "13892b0716d26443a8cdea15b3c6438b" + + def test_weight_quantization_mobilenetv1_8bit(self): + quantize_weight_bits = 8 + quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul'] + threshold_rate = 0.0 + self.run_test(self.model_name, self.model_data_url, self.model_data_md5, + quantize_weight_bits, quantizable_op_type, threshold_rate) + + def test_weight_quantization_mobilenetv1_16bit(self): + quantize_weight_bits = 16 + quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul'] + threshold_rate = 1e-9 + self.run_test(self.model_name, self.model_data_url, self.model_data_md5, + quantize_weight_bits, quantizable_op_type, threshold_rate) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 71dd49504a..24be25fda2 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -22,6 +22,12 @@ import numbers class TestTensor(unittest.TestCase): + def setUp(self): + self.support_dtypes = [ + 'bool', 'uint8', 'int8', 'int16', 'int32', 'int64', 'float16', + 'float32', 'float64' + ] + def test_int_tensor(self): scope = core.Scope() var = scope.var("test_tensor") @@ -184,15 +190,15 @@ class TestTensor(unittest.TestCase): tensor_array = numpy.array(tensor) self.assertEqual((0, 1), tensor_array.shape) - def run_sliece_tensor(self, place): - + def run_slice_tensor(self, place, dtype): tensor = fluid.Tensor() shape = [3, 3, 3] tensor._set_dims(shape) - tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [[10, 11, 12], [13, 14, 15], [16, 17, 18]], - [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]) + tensor_array = numpy.array( + [[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [[10, 11, 12], [13, 14, 15], [16, 17, 18]], + [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype(dtype) tensor.set(tensor_array, place) n1 = tensor[1] @@ -227,14 +233,15 @@ class TestTensor(unittest.TestCase): t8 = tensor_array[0::1, 0::-1, 2:] self.assertTrue((numpy.array(n8) == numpy.array(t8)).all()) - def test_sliece_tensor(self): - # run cpu first - place = core.CPUPlace() - self.run_sliece_tensor(place) + def test_slice_tensor(self): + for dtype in self.support_dtypes: + # run cpu first + place = core.CPUPlace() + self.run_slice_tensor(place, dtype) - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.run_sliece_tensor(place) + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.run_slice_tensor(place, dtype) def test_print_tensor(self): scope = core.Scope() @@ -299,6 +306,25 @@ class TestTensor(unittest.TestCase): self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP16) self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + def test_tensor_set_int16(self): + array = numpy.random.randint(100, size=(300, 500)).astype("int16") + tensor = fluid.Tensor() + place = core.CPUPlace() + tensor.set(array, place) + self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) + self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + tensor.set(array, place) + self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) + self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + + place = core.CUDAPinnedPlace() + tensor.set(array, place) + self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) + self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + def test_tensor_set_from_array_list(self): array = numpy.random.randint(1000, size=(200, 300)) list_array = [array, array] -- GitLab