diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ed965aaa0b6042067b9e1c226232c459f8c519ac..1d0fa6b3765982d4b77984f2ce8f979b69037bee 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -21,7 +21,10 @@ from ....framework import Program
 from ....initializer import Constant
 from .... import unique_name
 
-__all__ = ['QuantizationTransformPass', 'QuantizationFreezePass']
+__all__ = [
+    'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass',
+    'TransformForMobilePass'
+]
 
 
 class QuantizationTransformPass(object):
@@ -394,6 +397,7 @@ class QuantizationFreezePass(object):
 
         # remove the unused var node in the graph
         self._remove_unused_var_nodes(graph)
+        return graph
 
     def _remove_fake_quant_and_dequant_op(self, graph, op_node):
         k = op_node.op().output('Out')[0]
@@ -453,9 +457,9 @@ class QuantizationFreezePass(object):
     def _load_var(self, name):
         return np.array(self._scope.find_var(name).get_tensor())
 
-    def _restore_var(self, name, arr):
-        t = self._scope.find_var(name).get_tensor()
-        t.set(arr, self._place)
+    def _restore_var(self, name, array):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array, self._place)
 
     def _remove_unused_var_nodes(self, graph):
         all_used_vars = set()
@@ -496,3 +500,97 @@ class QuantizationFreezePass(object):
 
     def _quant(self, x, scale, num_bits):
         return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+
+
+class ConvertToInt8Pass(object):
+    def __init__(self, scope, place):
+        assert scope is not None, \
+            'The scope cannot be set None.'
+        assert place is not None, \
+            'The place cannot be set None.'
+        self._scope = scope
+        self._place = place
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+
+    def apply(self, graph):
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        ops = graph.all_ops()
+        input_map = {}
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._quantizable_ops:
+                for var_node in op_node.inputs:
+                    name = var_node.name()
+                    if name in persistable_vars:
+                        if name not in input_map:
+                            int8_var_node = self._convert_to_int8(graph,
+                                                                  var_node)
+                            input_map[name] = int8_var_node
+                        graph.update_input_link(var_node, input_map[name],
+                                                op_node)
+
+        # remove the unused var node in the graph
+        self._remove_unused_var_nodes(graph)
+        return graph
+
+    def _convert_to_int8(self, graph, var_node):
+        int8_var_node_name = var_node.name() + ".int8"
+        int8_var_node = graph.create_param_node(
+            name=cpt.to_text(int8_var_node_name),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=core.VarDesc.VarType.INT8)
+        array = self._load_var(var_node.name())
+        self._scope.var(int8_var_node_name)
+        self._store_var(int8_var_node_name, array, np.int8)
+        return int8_var_node
+
+    def _load_var(self, name):
+        return np.array(self._scope.find_var(name).get_tensor())
+
+    def _store_var(self, name, array, dtype):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array.astype(dtype), self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_ops()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_unused_vars = graph.all_vars() - all_used_vars
+        graph.safe_remove_nodes(all_unused_vars)
+
+
+class TransformForMobilePass(object):
+    def __init__(self):
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
+
+    def apply(self, graph):
+        ops = graph.all_ops()
+        for op_node in ops:
+            name = op_node.name()
+            if name in self._fake_quant_op_names:
+                op_node.op().set_type('quantize')
+                quant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, quant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(quant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+            if name in self._fake_dequant_op_names:
+                op_node.op().set_type('dequantize')
+                dequant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, dequant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(dequant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+
+        return graph
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index bb8f51cc8ce3e49f8aa57063dd756b64674f6969..a8d750724641875215702c9b139adfda0fe17d7d 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -18,10 +18,11 @@ import numpy as np
 import paddle.fluid as fluid
 import six
 import paddle
-from paddle.fluid.framework import Program
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
+from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
+from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
 from paddle.fluid import core
 
 
@@ -233,10 +234,22 @@ class TestQuantizationFreezePass(unittest.TestCase):
             scope=scope, program_exe=exe, activation_quantize_type=quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
+        marked_nodes = set()
+        for op in main_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
+        quantized_main_program = main_graph.to_program()
+        quantized_test_program = test_graph.to_program()
 
         iters = 5
         batch_size = 8
-        dev_name = '_gpu_' if use_cuda else '_cpu_'
 
         train_reader = paddle.batch(
             paddle.reader.shuffle(
@@ -248,66 +261,86 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(program=main_graph.to_program(),
+                loss_v = exe.run(program=quantized_main_program,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
-                print('{}: {}'.format(dev_name, loss_v))
+                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
+        test_data = next(test_reader())
+        with fluid.program_guard(quantized_test_program):
+            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+                                             quantized_test_program)
+        # Testing
+        with fluid.scope_guard(scope):
+            test_loss1, w_quant = exe.run(program=quantized_test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[loss, w_var])
+
+        # Freeze graph for inference, but the weights of fc/conv are still in float type.
+        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass.apply(test_graph)
         marked_nodes = set()
-        for op in main_graph.all_ops():
+        for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
-        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+                        marked_nodes)
 
-        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
-        origin_marked_nodes = set()
+        server_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            test_loss2, = exe.run(program=server_program,
+                                  feed=feeder.feed(test_data),
+                                  fetch_list=[loss])
+        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+        print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
+        print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
+        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
+        # This may fail due to the calculation precision.
+        self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
+        print('{}: {}'.format('w_quant' + dev_name + quant_type,
+                              np.sum(w_quant)))
+
+        # Convert parameter to 8-bit.
+        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
+        convert_int8_pass.apply(test_graph)
+        marked_nodes = set()
         for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
-                origin_marked_nodes.add(op)
-        test_graph.draw('.', 'test_origin' + dev_name + quant_type,
-                        origin_marked_nodes)
-        freeze_pass.apply(test_graph)
-        freeze_marked_nodes = set()
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
+        server_program_int8 = test_graph.to_program()
+        # Save the 8-bit parameter and model file.
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          server_program_int8)
+            # Test whether the 8-bit parameter and model file can be loaded successfully.
+            [infer, feed, fetch] = fluid.io.load_inference_model(
+                'server_int8' + dev_name + quant_type, exe)
+        # Check the loaded 8-bit weight.
+        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
+        self.assertEqual(w_8bit.dtype, np.int8)
+        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+        print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
+
+        mobile_pass = TransformForMobilePass()
+        mobile_pass.apply(test_graph)
+        marked_nodes = set()
         for op in test_graph.all_ops():
             if op.name().find('quantize') > -1:
-                freeze_marked_nodes.add(op)
-        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
-                        freeze_marked_nodes)
-
-        # with fluid.program_guard(test_program):
-        #     test_data = next(test_reader())
-        #     w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
-        #                                      test_program)
-        #     # Testing during training
-        #     test_loss1, w_quant = exe.run(program=test_program,
-        #                                   feed=feeder.feed(test_data),
-        #                                   fetch_list=[loss, w_var])
-
-        #     # Freeze program for inference, but the weight of fc/conv is still float type.
-        #     quant_transpiler.freeze_program(test_program, place)
-        #     test_loss2, = exe.run(program=test_program,
-        #                           feed=feeder.feed(test_data),
-        #                           fetch_list=[loss])
-        #     self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-        #     w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
-        #                         .get_tensor())
-        #     # fail: -432.0 != -433.0, this is due to the calculation precision
-        #     #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-
-        #     # Convert parameter to 8-bit.
-        #     quant_transpiler.convert_to_int8(test_program, place)
-        #     # Save the 8-bit parameter and model file.
-        #     fluid.io.save_inference_model('model_8bit', ['image', 'label'],
-        #                                   [loss], exe, test_program)
-        #     # Test whether the 8-bit parameter and model file can be loaded successfully.
-        #     [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
-        #                                                           exe)
-        #     # Check the loaded 8-bit weight.
-        #     w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
-        #                       .get_tensor())
-
-        #     self.assertEqual(w_8bit.dtype, np.int8)
-        #     self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
+                        marked_nodes)
+
+        mobile_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          mobile_program)
 
     def test_freeze_program_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 86fa84ad4bd7a55fb27f4e43128f0bfda6dfe6db..ade2a388f23d9ea5ddf34fe2c1ac29df7e25effc 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
 
-        quant_transpiler = QuantizeTranspiler()
-        quant_transpiler.training_transpile(main)
-        quant_transpiler.training_transpile(test_program)
+        quant_type = 'abs_max'
+        quant_transpiler = QuantizeTranspiler(
+            activation_quantize_type=quant_type)
+        quant_transpiler.training_transpile(main, startup)
+        quant_transpiler.training_transpile(test_program, startup)
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -223,12 +225,14 @@ class TestQuantizeTranspiler(unittest.TestCase):
             paddle.dataset.mnist.test(), batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
         with fluid.program_guard(main):
             for _ in range(iters):
                 data = next(train_reader())
                 loss_v = exe.run(program=main,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
+                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         with fluid.program_guard(test_program):
             test_data = next(test_reader())
@@ -245,11 +249,19 @@ class TestQuantizeTranspiler(unittest.TestCase):
                                       feed=feeder.feed(test_data),
                                       fetch_list=[loss])
             self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+            print('{}: {}'.format('test_loss1' + dev_name + quant_type,
+                                  test_loss1))
+            print('{}: {}'.format('test_loss2' + dev_name + quant_type,
+                                  test_loss2))
             w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                 .get_tensor())
             # fail: -432.0 != -433.0, this is due to the calculation precision
             #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                                  np.sum(w_freeze)))
+            print('{}: {}'.format('w_quant' + dev_name + quant_type,
+                                  np.sum(w_quant)))
 
             # Convert parameter to 8-bit.
             quant_transpiler.convert_to_int8(test_program, place)
             # Save the 8-bit parameter and model file.
@@ -264,13 +276,17 @@ class TestQuantizeTranspiler(unittest.TestCase):
 
             self.assertEqual(w_8bit.dtype, np.int8)
             self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+            print('{}: {}'.format('w_8bit' + dev_name + quant_type,
+                                  np.sum(w_8bit)))
+            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                                  np.sum(w_freeze)))
 
-    def not_test_freeze_program_cuda(self):
+    def test_freeze_program_cuda(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_program(True, seed=1)
 
-    def not_test_freeze_program_cpu(self):
+    def test_freeze_program_cpu(self):
         with fluid.unique_name.guard():
             self.freeze_program(False, seed=2)
 
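For reference, the test above drives the new passes in a fixed order. The sketch below mirrors that flow; it is illustrative and not part of the patch, and it assumes an already-built IrGraph named graph plus the scope, place and exe objects that the test constructs.

# Illustrative only: mirrors the pipeline exercised by TestQuantizationFreezePass.
# `graph` (an IrGraph), `scope`, `place` and `exe` are assumed to exist already.
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass

# 1. Insert fake_quantize/fake_dequantize ops for quantization-aware training.
QuantizationTransformPass(
    scope=scope, program_exe=exe,
    activation_quantize_type='abs_max').apply(graph)
# ... run training on graph.to_program() ...

# 2. Freeze the graph for inference; fc/conv weights are still float here.
QuantizationFreezePass(scope=scope, place=place).apply(graph)

# 3. Convert the persistable weights of conv2d/depthwise_conv2d/mul to INT8.
ConvertToInt8Pass(scope=scope, place=place).apply(graph)

# 4. Rewrite the fake quant/dequant ops into quantize/dequantize ops for mobile.
TransformForMobilePass().apply(graph)

mobile_program = graph.to_program()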