diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d26eebc8ff64784bdd9c37e123f49f7471ee0d50..def7ad9c40502c0a8e086780fd8b91b09dde2dd3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -299,6 +299,10 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 9cdcb87df5dd1669066c204c86c269973df506f1..1c5ded943b3814688af1f177503d3bdc35073c3f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -87,6 +87,7 @@ if (WITH_TESTING) endif() endif() add_subdirectory(paddle/fluid/tests) + add_subdirectory(paddle/fluid/contrib/tests) endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index c82bc0b940a32bf584e87646442c2507864c2285..3bf2fe5db0cb2126295ebfda822eeac8762dbdb7 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -20,8 +20,11 @@ from . import memory_usage_calc from .memory_usage_calc import * from . import op_frequence from .op_frequence import * +from . import quantize +from .quantize import * __all__ = [] __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ +__all__ += quantize.__all__ diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14c208d0e7f35ebfbbe1c36d0b11a8d0f0efb4a6 --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quantize_transpiler
+from .quantize_transpiler import *
+
+__all__ = quantize_transpiler.__all__
diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..032d0353ea6d80c4356ea9a9886ea59c48feec7a
--- /dev/null
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
@@ -0,0 +1,557 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import numpy as np
+
+from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid import unique_name
+from paddle.fluid import core
+from paddle.fluid.initializer import Constant
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layers.nn import autoincreased_step_counter
+from paddle.fluid.framework import Variable
+from paddle.fluid.executor import global_scope
+from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
+
+__all__ = ['QuantizeTranspiler']
+
+_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
+
+
+def _quantized_var_name(var_name):
+    """
+    Return the quantized variable name for the input `var_name`.
+    """
+    return "%s.quantized" % (var_name)
+
+
+def _dequantized_var_name(var_name):
+    """
+    Return the dequantized variable name for the input `var_name`.
+    """
+    return "%s.dequantized" % (var_name)
+
+
+def _quantized_scale_name(var_name):
+    """
+    Return the scale variable name for the input `var_name`.
+    """
+    return "%s.scale" % (var_name)
+
+
+def _original_var_name(var_name):
+    """
+    Return the original variable name.
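+
+    For example (the names are only illustrative), "fc_0.w_0.quantized.dequantized"
+    and "fc_0.w_0.quantized" both map back to "fc_0.w_0".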
+    """
+    if var_name.endswith('.quantized.dequantized'):
+        return var_name[:-len('.quantized.dequantized')]
+    if var_name.endswith('.quantized'):
+        return var_name[:-len('.quantized')]
+    if var_name.endswith('.dequantized'):
+        return var_name[:-len('.dequantized')]
+    if var_name.endswith('.scale'):
+        return var_name[:-len('.scale')]
+    else:
+        return var_name
+
+
+def _is_float(v):
+    return isinstance(v, float) or isinstance(v, np.float32)
+
+
+def quant(x, scale, num_bits):
+    y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+    return y
+
+
+class QuantizeTranspiler(object):
+    def __init__(self,
+                 weight_bits=8,
+                 activation_bits=8,
+                 activation_quantize_type='abs_max',
+                 weight_quantize_type='abs_max',
+                 window_size=10000):
+        """
+        Convert and rewrite the fluid Program according to the weight and
+        activation quantization type.
+
+        Args:
+            weight_bits (int): quantization bit number for weights; the bias
+                is not quantized.
+            activation_bits (int): quantization bit number for activations.
+            activation_quantize_type (str): quantization type for activations;
+                'abs_max' and 'range_abs_max' are supported. With 'abs_max',
+                the quantization scale is calculated dynamically at each step
+                during both training and testing. With 'range_abs_max', a
+                static quantization scale is calculated during training and
+                used for inference.
+            weight_quantize_type (str): quantization type for weights;
+                'abs_max' is supported. 'range_abs_max' is usually not used
+                for weights, since weights are fixed once the model is well
+                trained.
+            window_size (int): the window size for 'range_abs_max' quantization.
+
+        Examples:
+
+        .. code-block:: python
+
+            # The original program is rewritten in place; if you do not want
+            # it to be changed, clone it first.
+            # quantize_program = program.clone()
+            t = fluid.QuantizeTranspiler()
+            t.training_transpile(quantize_program)
+
+        """
+        self.weight_bits = weight_bits
+        self.activation_bits = activation_bits
+        quant_type = ['abs_max', 'range_abs_max']
+        if weight_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown weight_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'range_abs_max'." % str(weight_quantize_type))
+        if activation_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown activation_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'range_abs_max'." %
+                str(activation_quantize_type))
+
+        self.weight_quantize_type = weight_quantize_type
+        self.activation_quantize_type = activation_quantize_type
+
+        self.window_size = window_size
+        self.helper = LayerHelper(self.__class__.__name__)
+        self.fake_quant_op_types = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
+        self.is_test = None
+        self.global_step = None
+
+    def training_transpile(self, program=None, startup_program=None):
+        """Rewrite a training input program in place for simulated
+        quantization. Fake quantization and de-quantization ops are inserted
+        into the program to simulate the error introduced by quantization,
+        and the gradient ops' inputs are changed to use the fake-quantized
+        weights and activations. Since the program is transformed in place,
+        the graph connections will change.
+
+        Args:
+            program (Program): the input program to be transpiled.
+        """
+        self.is_test = False
+        program = default_main_program() if program is None else program
+        startup_program = default_startup_program() if startup_program is \
+            None else startup_program
+
+        # maps each original variable name to its dequantized counterpart,
+        # one dict per block.
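+        # Illustrative sketch of the rewrite below (variable names are
+        # examples only): a quantizable op such as
+        #     mul(X=in, Y=w) -> out
+        # becomes
+        #     fake_quantize(in) -> in.quantized, in.scale
+        #     fake_dequantize(in.quantized, in.scale) -> in.quantized.dequantized
+        #     mul(X=in.quantized.dequantized, Y=w.quantized.dequantized) -> out
+        # (the weight input w is handled the same way), so the forward pass
+        # and the gradient ops both observe the quantization error.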
+        dequanted_vars = [
+            collections.OrderedDict() for _ in range(len(program.blocks))
+        ]
+        grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]
+
+        params = [p.name for p in program.global_block().iter_parameters()]
+
+        def _transpile_forward(block, op):
+            idx = block.ops.index(op)
+            block_id = block.idx
+            # insert quant op and dequant op
+            for name in op.input_arg_names:
+                if name in dequanted_vars[block_id]:
+                    dequant_var = dequanted_vars[block_id][name]
+                else:
+                    var = block.var(name)
+                    quant_bits = self.weight_bits if var.name in params \
+                        else self.activation_bits
+                    quant_type = self.weight_quantize_type if var.name \
+                        in params else self.activation_quantize_type
+
+                    quant_var, scale_var = self._insert_quant_op(
+                        block, idx, var, quant_bits, quant_type)
+                    dequant_var = self._insert_dequant_op(
+                        block, idx + 1, quant_var, scale_var, quant_bits)
+                    dequanted_vars[block_id][name] = dequant_var
+                # rename the forward op inputs
+                op._rename_input(name, dequant_var.name)
+
+        def _transpile_backward(block, op):
+            block_id = block.idx
+            no_dequanted_input_vars = True
+            for name in op.input_arg_names:
+                if name in dequanted_vars[block_id]:
+                    dequant_var = dequanted_vars[block_id][name]
+                    op._rename_input(name, dequant_var.name)
+                    no_dequanted_input_vars = False
+            if no_dequanted_input_vars:
+                raise ValueError("There are no dequantized inputs for op %s."
+                                 % (op.type))
+
+        with program_guard(program, startup_program):
+            self._create_global_step()
+            for block in program.blocks:
+                ops = list(block.ops)
+                block_id = block.idx
+                for op in ops:
+                    # rewrite the forward ProgramDesc
+                    if op.type in _QUANTIZABLE_OP_TYPES:
+                        _transpile_forward(block, op)
+                    # rename the backward op inputs
+                    if op.type in grad_op_types:
+                        _transpile_backward(block, op)
+
+    def _create_global_step(self):
+        if self.weight_quantize_type == 'range_abs_max' or \
+                self.activation_quantize_type == 'range_abs_max':
+            self.global_step = autoincreased_step_counter()
+
+    def freeze_program(self, program, place, fuse_bn=False, scope=None):
+        """Freeze the input training program for inference.
+
+        Args:
+            program (Program): the input program to be transpiled.
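+            place (CPUPlace|CUDAPlace): the place on which the weight tensors
+                are loaded and the quantized weights are restored.
+            fuse_bn (bool): whether to fuse batch_norm ops into the preceding
+                conv2d/depthwise_conv2d/mul ops before freezing.
+            scope (Scope): the scope that holds the parameter tensors; the
+                global scope is used when it is None.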
+ """ + + self.is_test = True + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + if fuse_bn: + bn_fuse_transpiler = BNFuseTranspiler() + bn_fuse_transpiler.transpile(program, place) + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + op_in_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + op_out_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + var_scale_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + + def _remove_fake_quant_and_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + k = op.output('Out')[0] + v = op.input('X')[0] + if v not in op_in_rename_map[block_id]: + op_in_rename_map[block_id][k] = v + else: + op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v] + block._remove_op(idx) + + def _insert_post_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + max_range = None + scale_var = None + for name in op.input_arg_names: + if name in op_in_rename_map[block_id]: + op._rename_input(name, op_in_rename_map[block_id][name]) + + scale_v = var_scale_map[block_id][_original_var_name(name)] + if _original_var_name(name) in persistable_vars: + param_range = (1 << (self.weight_bits - 1)) - 1 + act_range = (1 << (self.activation_bits - 1)) - 1 + assert _is_float(scale_v) + max_range = param_range * act_range / scale_v + else: + assert isinstance(scale_v, Variable) + scale_var = var_scale_map[block_id][_original_var_name( + name)] + + if len(op.output_arg_names) != 1: + raise ValueError("Only support one output, but op %s has" + " more than one output." % (op.type)) + out_var = block.var(op.output_arg_names[0]) + dequant_var = block.create_var( + name=_dequantized_var_name(out_var.name), + type=out_var.type, + shape=out_var.shape, + dtype=out_var.dtype) + # insert fake_dequantize_op + dequant_op = block._insert_op( + idx + 1, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": out_var, + 'Scale': scale_var}, + outputs={"Out": dequant_var}) + op_out_rename_map[block_id][out_var.name] = dequant_var.name + return dequant_var + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + def _restore_var(name, arr): + t = scope.find_var(name).get_tensor() + t.set(arr, place) + + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + op_type = op.type + + # insert dequant_op after fc/conv, need to rename + # input of the followed ops + for name in op.input_arg_names: + if name in op_out_rename_map[block_id]: + op._rename_input(name, + op_out_rename_map[block_id][name]) + + if op_type in self.fake_quant_op_types: + in_arg_name = op.input('X')[0] + if in_arg_name in persistable_vars: + if self.weight_quantize_type == 'abs_max': + param = _load_var(in_arg_name) + scale_v = np.max(np.abs(param)) + else: + scale_v = _load_var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + else: + scale_v = block.var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + + if in_arg_name in persistable_vars: + _remove_fake_quant_and_dequant_op(block, op) + # quantize weight and restore + param_t = _load_var(in_arg_name) + param_q_t = quant(param_t, scale_v, self.weight_bits) + _restore_var(in_arg_name, param_q_t) + + if op_type in self.fake_dequant_op_types: + 
_remove_fake_quant_and_dequant_op(block, op) + + if op_type in _QUANTIZABLE_OP_TYPES: + dequant_var = _insert_post_dequant_op(block, op) + + # remove the unused var in ProgramDesc + self._remove_unused_var(program) + #program = program.clone() + + def convert_to_int8(self, program, place, scope=None): + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + global_block = program.global_block() + + def convert_to_int8(var): + int8_var_name = var.name + ".int8" + int8_var = global_block.create_parameter( + name=int8_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.INT8, + shape=var.shape) + + tensor = _load_var(var.name) + + scope.var(int8_var_name) + int8_tensor = scope.find_var(int8_var_name).get_tensor() + int8_tensor.set(tensor.astype(np.int8), place) + return int8_var + + input_map = {} + for block in program.blocks: + for op in list(block.ops): + if op.type in _QUANTIZABLE_OP_TYPES: + for name in op.input_arg_names: + var = block.var(name) + if var.persistable: + if name not in input_map: + int8_var = convert_to_int8(var) + input_map[name] = int8_var.name + op._rename_input(name, input_map[name]) + self._remove_unused_var(program) + + def _remove_unused_var(self, program): + all_remove_vars = [] + for block in program.blocks: + args = [] + for op in block.ops: + args += op.input_arg_names + args += op.output_arg_names + args = list(set(args)) + var_names = block.vars.keys() + sub_block_remove_vars = [] + for var in var_names: + if var not in args: + sub_block_remove_vars.append(var) + all_remove_vars.append(sub_block_remove_vars) + + remove_vars = [list(set(v)) for v in all_remove_vars] + for i, block in enumerate(program.blocks): + for v in remove_vars[i]: + block._remove_var(v) + + def _insert_quant_abs_max_op(self, block, idx, var, quant_bits): + """Insert fake_quantize_abs_max op. 
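+
+        The quantization scale is computed by the op itself at runtime as the
+        maximum absolute value of the input, so no scale state has to be kept
+        between steps (unlike 'range_abs_max').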
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = block.create_var(
+            name=_quantized_scale_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_abs_max',
+            attrs={'bit_length': quant_bits},
+            inputs={'X': var},
+            outputs={'Out': quant_var,
+                     'OutScale': scale})
+        return quant_var, scale
+
+    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
+        """Insert fake_quantize_range_abs_max op.
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = self.helper.create_parameter(
+            attr=ParamAttr(
+                name=_quantized_scale_name(var.name),
+                initializer=Constant(0.001),
+                trainable=False),
+            shape=[1],
+            dtype=var.dtype)
+        scale.stop_gradient = True
+
+        ins = {'X': var, 'InScale': scale}
+        outs = {'Out': quant_var, 'OutScale': scale}
+        if not self.is_test:
+            # A buffer to record the scales of the recent `window_size` steps.
+            scales = self.helper.create_global_variable(
+                name=unique_name.generate('scales'),
+                persistable=True,
+                dtype=var.dtype,
+                shape=[self.window_size])
+            self.helper.set_variable_initializer(
+                scales, initializer=Constant(value=0))
+
+            ins['Iter'] = self.global_step
+            outs['OutScales'] = scales
+
+        attrs = {
+            'window_size': self.window_size,
+            'bit_length': quant_bits,
+            'is_test': self.is_test
+        }
+
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_range_abs_max',
+            attrs=attrs,
+            inputs=ins,
+            outputs=outs)
+
+        return quant_var, scale
+
+    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
+        """
+        Insert a fake_quantize op according to `quant_type`.
+        """
+        if quant_type == 'abs_max':
+            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
+        elif quant_type == 'range_abs_max':
+            return self._insert_quant_range_abs_max_op(block, idx, var,
+                                                       quant_bits)
+
+    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
+        """
+        Insert a fake_dequantize_max_abs op.
+        """
+        dequant_var = block.create_var(
+            name=_dequantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        # insert fake_dequantize_op
+        max_range = (1 << (quant_bits - 1)) - 1
+        dequant_op = block._insert_op(
+            idx,
+            type="fake_dequantize_max_abs",
+            attrs={'max_range': float(max_range)},
+            inputs={"X": var,
+                    'Scale': scale},
+            outputs={"Out": dequant_var})
+        return dequant_var
+
+
+class BNFuseTranspiler(InferenceTranspiler):
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        def _update_param(op, param_name, new_param):
+            var = self.block.vars[param_name]
+            tensor = self.scope.find_var(param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias")[0])  #Bias
+        scale_bn = _load_param(bn_op.input("Scale")[0])  #Scale
+        mean_bn = _load_param(bn_op.input("Mean")[0])  #Mean
+        var_bn = _load_param(bn_op.input("Variance")[0])  #Variance
+
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            current_param = _load_param(
+                _original_var_name(current_op.input("Filter")[0]))
+        elif current_op.type == 'mul':
+            current_param = _load_param(
+                _original_var_name(current_op.input("Y")[0]))
+
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
else: + bias = np.zeros(bias_bn.shape) + bias = np.float32( + np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn)) + + # re-compute weight of conv2d/fc + tmp = tmp.reshape(tmp.shape[0], -1) + dst_param = current_param.reshape((tmp.shape[0], -1)) + dst_param = np.float32(np.multiply(dst_param, tmp)) + dst_param = dst_param.reshape(current_param.shape) + + # update parameters + if current_op.type in ['conv2d', 'depthwise_conv2d']: + _update_param(current_op, + _original_var_name(current_op.input("Filter")[0]), + dst_param) + elif current_op.type == 'mul': + _update_param(current_op, + _original_var_name(current_op.input("Y")[0]), + dst_param) + + _update_param(bias_op, bias_op.input("Y")[0], bias) + + # collect the renamed input + self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..9af3a6c9fda121d411a8a19f3928238be84fe8a6 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -0,0 +1,272 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
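+
+# The tests below exercise the whole quantization workflow:
+#   1. QuantizeTranspiler.training_transpile inserts fake quant/dequant ops;
+#   2. freeze_program folds those ops and quantizes the weights for inference;
+#   3. convert_to_int8 stores the quantized weights as int8 parameters;
+#   4. the int8 model is saved and reloaded with the inference-model APIs.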
+ +import numpy as np +import six + +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name +from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + return avg_loss + + +class TestQuantizeTranspiler(unittest.TestCase): + def setUp(self): + # since quant_op and dequant_op is not ready, use cos and sin for test + self.weight_quant_op_type = 'fake_quantize_abs_max' + self.dequant_op_type = 'fake_dequantize_max_abs' + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_op_grad_and_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, program): + quantized_ops = {} + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + + for block in program.blocks: + for idx, op in enumerate(block.ops): + # check forward + if op.type in self.quantizable_op_and_inputs: + for i, arg_name in enumerate(op.input_arg_names): + quant_op_type = self.weight_quant_op_type if \ + _original_var_name(arg_name) \ + in persistable_vars else self.act_quant_op_type + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + if arg_name not in quantized_ops: + self.assertEqual(block.ops[idx - 2 * i - 1].type, + self.dequant_op_type) + self.assertEqual(block.ops[idx - 2 * i - 2].type, + quant_op_type) + quantized_ops[arg_name] = block.ops[idx - 2 * i - 2] + else: + op_idx = block.ops.index(quantized_ops[arg_name]) + self.assertLess(op_idx, idx) + + # 
check backward + if op.type in self.quantizable_op_grad_and_inputs: + for pname in self.quantizable_op_grad_and_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + def freeze_program(self, use_cuda): + def build_program(main, startup, is_test): + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + loss = conv_net(img, label) + if not is_test: + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + return [img, label], loss + + main = fluid.Program() + startup = fluid.Program() + test_program = fluid.Program() + + feeds, loss = build_program(main, startup, False) + build_program(test_program, startup, True) + test_program = test_program.clone(for_test=True) + + quant_transpiler = QuantizeTranspiler() + quant_transpiler.training_transpile(main) + quant_transpiler.training_transpile(test_program) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + iter = 5 + batch_size = 8 + class_num = 10 + exe.run(startup) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + + with fluid.program_guard(main): + for _ in range(iter): + data = next(train_reader()) + loss_v = exe.run(program=main, + feed=feeder.feed(data), + fetch_list=[loss]) + + with fluid.program_guard(test_program): + test_data = next(test_reader()) + w_var = fluid.framework._get_var('conv2d_1.w_0.quantized', + test_program) + # Testing during training + test_loss1, w_quant = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss, w_var]) + + # Freeze program for inference, but the weight of fc/conv is still float type. 
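+            # After freezing, the fake_quantize/fake_dequantize pairs are
+            # folded away and the weights hold the quantized values (still
+            # stored as float), so the sums of w_freeze and w_quant below are
+            # expected to match.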
+ quant_transpiler.freeze_program(test_program, place) + test_loss2, = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss]) + self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3) + w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0') + .get_tensor()) + self.assertEqual(np.sum(w_freeze), np.sum(w_quant)) + + # Convert parameter to 8-bit. + quant_transpiler.convert_to_int8(test_program, place) + # Save the 8-bit parameter and model file. + fluid.io.save_inference_model('model_8bit', ['image', 'label'], + [loss], exe, test_program) + # Test whether the 8-bit parameter and model file can be loaded successfully. + [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit', + exe) + # Check the loaded 8-bit weight. + w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8') + .get_tensor()) + + self.assertEqual(w_8bit.dtype, np.int8) + self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) + + def test_freeze_program_cuda(self): + if fluid.core.is_compiled_with_cuda(): + with fluid.unique_name.guard(): + self.freeze_program(True) + + def test_freeze_program_cpu(self): + with fluid.unique_name.guard(): + self.freeze_program(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 28c7ae5341b20f0f79da8cf682d279fc4cc3fa19..c9a8176a72fb744963ae466e965a25bdfb0a44de 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory from .ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "memory_optimize", "release_memory", "HashName", - "RoundRobin", "DistributeTranspilerConfig" + "DistributeTranspiler", + "memory_optimize", + "release_memory", + "HashName", + "RoundRobin", + "DistributeTranspilerConfig", ] diff --git a/python/setup.py.in b/python/setup.py.in index 786c9f2e39880b68700b8acb94b3d35a48323958..b376be0ea373f089ef17f27435d979712fbdff72 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -106,6 +106,7 @@ packages=['paddle', 'paddle.fluid.layers', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', + 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']