提交 19da59ed 编写于 作者: 翟飞跃 提交者: Tao Luo

Remove all the code, API and doc of MKL-DNN INT8v1 (#18347)

上级 8ed33bf9
...@@ -403,9 +403,6 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_ ...@@ -403,9 +403,6 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5')) paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5'))
paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab')) paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0')) paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0'))
......
...@@ -22,8 +22,6 @@ from . import op_frequence ...@@ -22,8 +22,6 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import int8_inference
from .int8_inference import *
from . import reader from . import reader
from .reader import * from .reader import *
from . import slim from . import slim
...@@ -44,7 +42,6 @@ __all__ += decoder.__all__ ...@@ -44,7 +42,6 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += int8_inference.__all__
__all__ += reader.__all__ __all__ += reader.__all__
__all__ += slim.__all__ __all__ += slim.__all__
__all__ += utils.__all__ __all__ += utils.__all__
......
# Offline INT8 Calibration Tool
PaddlePaddle supports offline INT8 calibration to accelerate inference. This document explains how to enable INT8 calibration and reports the accuracy and throughput results for ResNet-50 and MobileNet-V1.
## 0. Prerequisite
You need the PaddlePaddle Python package, version 1.3 or later, for example `pip install paddlepaddle==1.3`.
## 1. How to generate INT8 model
You can refer to the unit test in [test_calibration_resnet50.py](../tests/test_calibration_resnet50.py). Basically, there are three steps:
* Construct calibration object.
```python
calibrator = int8_utility.Calibrator( # Step 1
program=infer_program, # required, FP32 program
pretrained_model=model_path, # required, FP32 pretrained model
algo=algo, # required, calibration algorithm; default is max, the alternative is KL (Kullback–Leibler divergence)
exe=exe, # required, executor
output=int8_model, # required, INT8 model
feed_var_names=feed_dict, # required, feed dict
fetch_list=fetch_targets) # required, fetch targets
```
* Call `calibrator.sample_data()` after the executor run.
```python
_, acc1, _ = exe.run(
program,
feed={feed_dict[0]: image,
feed_dict[1]: label},
fetch_list=fetch_targets)
calibrator.sample_data() # Step 2
```
* Call the calibrator.save_int8_model() after sampling over specified iterations (e.g., iterations = 50)
```python
calibrator.save_int8_model() # Step 3
```
## 2. How to run INT8 model
You can load the INT8 model with the `load_inference_model` [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference in the same way as [FP32](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/eval.py "FP32").
```python
[infer_program, feed_dict,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
```
## 3. Result
We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 (single core).
**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.23% | 0.40% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.47% | 0.31% |
**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 11.54 images/s | 32.2 images/s | 2.79 |
| MobileNet-V1 | Full ImageNet Val | 49.21 images/s | 108.37 images/s | 2.2 |
Please note that [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset") can be downloaded by script `test_calibration.py` with `DATASET=full`.
Notes:
* The accuracy measurement requires the model with `label`.
* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `The theoretical peak compute gains are 4x int8 OPS over fp32 OPS.` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). Therefore, op-level gain is 4X and topology-level is smaller.
## 4. How to reproduce the results
* Small dataset for ResNet-50 (Single core)
```bash
FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
>Note: Change `test_calibration_resnet50.py` to `test_calibration_mobilenetv1.py` for MobileNet-V1. Same for the following commands.
* Full dataset for ResNet-50 (Single core)
```bash
FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
* Full dataset for ResNet-50 (Multi-core)
```bash
FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
> Note: This example command uses 20 cores, selected by setting the `OMP_NUM_THREADS` value.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import utility
from .utility import *
__all__ = utility.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core
import numpy as np
import math
import os
from paddle.fluid.executor import global_scope
from paddle.fluid import io
__all__ = ['Calibrator']
class Calibrator(object):
'''
The calibrator class transforms the program and updates the calculated scale into it.
This is INT8 v1 calibration tool, mainly for the support of ResNet-50 and MobileNet.
'''
# TODO(guomingz): Below op list will be updated once more INT8 op kernels are supported.
# The trailing comma matters: without it this is the plain string "pool2d"
# and `op_type in non_conv_int8_op_type` becomes a substring test instead
# of an exact op-type match.
non_conv_int8_op_type = ("pool2d", )
# Op types checked when deciding whether a tensor can stay in INT8.
supported_int8_op_type = ("conv2d", "pool2d")
# Producers that are skipped while tracing a variable back to its source op.
const_sign_op_type = ('pool2d', 'reshape', 'concat', 'transpose')
# Quantization ranges used for unsigned and signed 8-bit outputs.
u8_max = 255
s8_max = 127
def __init__(self, *args, **kwargs):
    '''
    Set up the calibrator from keyword arguments.

    Required keywords: program, pretrained_model, algo, output,
    feed_var_names, fetch_list and exe. The optional keyword debug
    defaults to False. Positional arguments are accepted but not used.
    '''
    # Caller supplied configuration; a missing required key raises KeyError.
    self.program = kwargs['program']
    self.pretrained_model = kwargs['pretrained_model']
    self.debug = kwargs.get('debug', False)
    self.algo = kwargs['algo']
    self.output = kwargs['output']
    self.feed_var_names = kwargs['feed_var_names']
    self.fetch_list = kwargs['fetch_list']
    self.exe = kwargs['exe']

    # Names of the variables collected for sampling.
    self._conv_input_var_name = []
    self._conv_output_var_name = []
    self._pool2d_output_var_name = []
    self._weights_var_name = []
    self._residual_input_var_name = []
    self._int8_output_var_op_index_dict = {}

    # Positions of all conv2d ops inside the global block.
    conv_positions = []
    for position, op in enumerate(self.program.global_block().ops):
        if op.type == 'conv2d':
            conv_positions.append(position)
    self._conv_op_index = conv_positions

    # Calibration results, filled in by save_int8_model().
    self._var_max_value_map = {}
    self._var_max_range = {}
    self._weights_scaling_factor = {}
    self._u8_output_var = []
    self._s8_output_var = []
    self._persistable_vars = []
    self._sampling_data = {}

    self.__init_analysis()
    self.__generate_output_program()
def save_int8_model(self):
'''
Turn the collected samples into scales, rewrite the output program and
save it. The steps below run in a fixed order and build on each other.
'''
# Derive value ranges and weight scales from the sampled tensors.
self.__sampling(self._sampling_data)
# Store the scales as attributes of the conv2d ops.
self.__save_scale()
# Insert the quantize/dequantize ops.
self.__update_program()
# Reset persistable flags and set the dtype of the conv outputs.
self.__update_output_program_attr()
# Optional dump of the resulting program (only when self.debug is set).
self.__display_debug()
self.__save_offline_model()
def sample_data(self):
    '''
    Append the current tensor value of every sampling variable, read from
    the global scope, to the per-variable sample lists.
    '''
    for var in self.sampling_program.list_vars():
        var_name = var.name
        if var_name not in self.sampling_vars:
            continue
        tensor = global_scope().find_var(var_name).get_tensor()
        np_data = np.array(tensor)
        if var_name not in self._sampling_data:
            self._sampling_data[var_name] = []
        samples = self._sampling_data[var_name]
        samples.append(np_data)
def __save_offline_model(self):
    '''
    Write the sampling program to self.output as an inference model.
    '''
    target_dir = self.output
    feed_names = self.feed_var_names
    fetch_targets = self.fetch_list
    io.save_inference_model(target_dir, feed_names, fetch_targets, self.exe,
                            self.sampling_program)
def __display_debug(self):
    '''
    When debugging is enabled, write the dot graph of the output program
    and print the program.
    '''
    if not self.debug:
        return
    output_program = self._output_program
    self.__dot(output_program)
    print(self._output_program)
def __get_max_range_by_var_name(self, program, var_name):
"""
Decide the quantization range of a variable by tracing its producers.

The search starts at the last op of the current block that outputs
`var_name` and follows the first input of each visited op:
a `relu` op gives Calibrator.u8_max; a producer of type
pool2d/reshape/concat/transpose is passed through and the trace
continues; a conv2d producer is decided by its `fuse_relu` attribute;
any other producer gives Calibrator.s8_max.
Args:
program: the program whose current block is inspected.
var_name: name of the variable to trace.
Returns:
Calibrator.u8_max if the variable is traced back to a relu (or a
conv2d with `fuse_relu` set), otherwise Calibrator.s8_max.
"""
# -1 means that no op outputs var_name; the loop below is then skipped.
search_end_index = -1
input_index_name = {}
output_index_name = {}
ops_type = []
# Record type, inputs and outputs of every op, keyed by op position.
for index, op in enumerate(program.current_block().ops):
ops_type.append(op.type)
input_index_name[index] = op.input_arg_names
output_index_name[index] = op.output_arg_names
if var_name in op.output_arg_names:
search_end_index = index
# analysis
while search_end_index >= 0:
if ops_type[search_end_index] == "relu":
return Calibrator.u8_max
# Only the first input of the current op is followed.
input_name = input_index_name[search_end_index][0]
# Move to the first op that lists this input among its outputs.
for i in output_index_name.keys():
if input_name in output_index_name[i]:
search_end_index = i
break
# NOTE(review): if no producer is found, search_end_index is unchanged
# and the checks below apply to the same op again - confirm intended.
if ops_type[
search_end_index] not in Calibrator.const_sign_op_type and ops_type[
search_end_index] != 'conv2d':
return Calibrator.s8_max
if ops_type[search_end_index] != 'conv2d':
continue
if program.current_block().ops[search_end_index].has_attr(
'fuse_relu') and program.current_block().ops[
search_end_index].attr('fuse_relu'):
return Calibrator.u8_max
else:
return Calibrator.s8_max
return Calibrator.s8_max
def __check_op_type_with_specified_var_as_input(self,
                                                program,
                                                var_name,
                                                start_index=0):
    '''
    Tell whether every op (from start_index on) that reads var_name is an
    int8-enabled op.

    Returns:
        False as soon as one consumer has a type outside
        Calibrator.supported_int8_op_type, True otherwise (also when the
        variable has no consumer).
    '''
    candidate_ops = program.current_block().ops[start_index:]
    consumer_types = (op.type for op in candidate_ops
                      if var_name in op.input_arg_names)
    return all(op_type in Calibrator.supported_int8_op_type
               for op_type in consumer_types)
def __check_var_source_dt(self, var_name):
'''
Check whether the specified variable is the output of int8 conv op or not.
If true, return the original op index.
If false, return -1
'''
return self._int8_output_var_op_index_dict[
var_name] if var_name in self._int8_output_var_op_index_dict else -1
def __update_int8_output_var_op_index_dict(self, index, var_name=None):
'''
Update the int8_output_variable/op_index dictionary
'''
for k, v in self._int8_output_var_op_index_dict.items():
if v >= index:
self._int8_output_var_op_index_dict[k] = v + 1
if var_name:
self._int8_output_var_op_index_dict[var_name] = index
def __update_program(self):
'''
Update the program with the quantize/dequantize op insertion.

The insertion positions come from __get_quantize_dequantize_combination
and are visited in ascending order; each position is shifted by the
number of ops inserted so far.
'''
quantize_index, dequantize_index = self.__get_quantize_dequantize_combination(
self._output_program)
# Number of ops inserted so far; added to the recorded positions.
inserted_op_length = 0
# The calibrated maximum is taken with the KL method or with np.max.
calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max
insert_op_collection = sorted(quantize_index + dequantize_index)
for index in insert_op_collection:
if index in quantize_index:
# Quantize: create a UINT8 temporary fed by the first output of the op
# located just before the insertion point.
quantize_tmp = self._output_program.current_block().create_var(
name="quantize_{}_tmp".format(index),
dtype=core.VarDesc.VarType.UINT8)
original_out_name = self._output_program.current_block().ops[
index + inserted_op_length - 1].output_names[0]
original_out = self._output_program.current_block().ops[
index + inserted_op_length - 1].output(original_out_name)[0]
op = self._output_program.current_block()._insert_op(
index=index + inserted_op_length,
type="quantize",
inputs={"Input": original_out},
outputs={"Output": quantize_tmp}, )
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
# Scale = quantization range / calibrated maximum of the source variable.
op._set_attr(
"Scale", self._var_max_range[original_out] /
calc_max_func(self._var_max_value_map[original_out]))
# A signed range marks the quantize input as possibly negative.
if self.__get_max_range_by_var_name(
self._output_program,
original_out) == Calibrator.s8_max:
op._set_attr("is_negative_input", 1)
self.__update_int8_output_var_op_index_dict(
index + inserted_op_length, "quantize_{}_tmp".format(index))
inserted_op_length += 1
# Redirect the following int8-enabled ops that read the original output
# (as first entry of an input slot) to the quantized temporary.
for op in self._output_program.current_block().ops[
index + inserted_op_length:]:
for j in op.input_names:
if op.input(j) and op.input(
j
)[0] == original_out and op.type in Calibrator.supported_int8_op_type:
op.desc.set_input(j,
["{}".format(quantize_tmp.name)])
else:
# Dequantize: create a float32 temporary in front of the op at start_index.
start_index = index + inserted_op_length
dequantize_tmp_var = self._output_program.current_block(
).create_var(
name="dequantize_{}_tmp".format(index + 1),
dtype="float32", )
original_out_var = None
# Use the first input of that op which is produced by a conv2d/pool2d op.
for original_input in self._output_program.current_block().ops[
start_index].input_arg_names:
index_res = self.__get_op_index_by_output_var(
self._output_program, original_input)
if index_res != -1:
original_out_var = original_input
break
if original_out_var:
op = self._output_program.current_block()._insert_op(
index=start_index,
type="dequantize",
inputs={"Input": original_out_var},
outputs={"Output": dequantize_tmp_var})
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
op._set_attr("Scale", self._var_max_range[original_out_var]
/ calc_max_func(self._var_max_value_map[
original_out_var]))
# Redirect the later readers of the int8 variable to the dequantized
# temporary; conv2d ops with force_fp32_output set are left untouched.
for op_index in range(
start_index + 1,
len(self._output_program.current_block().ops)):
if self._output_program.current_block(
).ops[op_index].type == "conv2d" and self._output_program.current_block(
).ops[op_index].attr("force_fp32_output"):
continue
else:
for j in self._output_program.current_block().ops[
op_index].input_names:
if len(self._output_program.current_block().ops[
op_index].input(j)
) and self._output_program.current_block(
).ops[op_index].input(j)[
0] == original_out_var:
self._output_program.current_block(
).ops[op_index].desc.set_input(
j,
["{}".format(dequantize_tmp_var.name)])
# NOTE(review): the nesting of the three statements below is not visible
# in this view (indentation lost) - confirm against the original file.
inserted_op_length += 1
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
def __update_output_program_attr(self):
    '''
    Undo the persistable flags set for sampling, remove the matching
    entries under self.pretrained_model and set the dtype of the
    recorded u8/s8 output variables.
    '''
    import shutil  # local import: only needed for the cleanup below

    for var in self._output_program.list_vars():
        if var.name in self._persistable_vars:
            var.persistable = False
            # Delete through filesystem calls rather than a shell-built
            # `rm -rf` command, so names containing spaces or shell
            # metacharacters cannot change what gets removed.
            stale_path = "{}/{}".format(self.pretrained_model, var.name)
            try:
                if os.path.isdir(stale_path) and not os.path.islink(
                        stale_path):
                    shutil.rmtree(stale_path)
                elif os.path.lexists(stale_path):
                    os.remove(stale_path)
            except OSError:
                # The cleanup stays best effort, as it was with `rm -rf`
                # whose exit status was never checked.
                pass

    for name in self._u8_output_var:
        self._output_program.current_block().var(name).desc.set_dtype(
            core.VarDesc.VarType.UINT8)
    for name in self._s8_output_var:
        self._output_program.current_block().var(name).desc.set_dtype(
            core.VarDesc.VarType.INT8)
@property
def sampling_program(self):
return self._output_program
@property
def sampling_vars(self):
return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name
def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0):
return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
def __generate_output_program(self):
    '''
    Mark the sampling variables as persistable, remember which flags were
    changed, and clone the program into the output program.
    '''
    for var in self.program.list_vars():
        if var.persistable:
            continue
        if var.name not in self.sampling_vars:
            continue
        var.persistable = True
        self._persistable_vars.append(var.name)
    self._output_program = self.program.clone()
def __save_scale(self):
    '''
    Store the scale attributes on the conv2d ops of the output program.

    Every conv2d op except the first gets Scale_weights, Scale_in and
    Scale_out, plus Scale_in_eltwise when it has a ResidualData input.
    '''
    if self.algo == 'KL':
        reduce_max = self.__get_optimal_scaling_factor
    else:
        reduce_max = np.max

    def activation_scale(name):
        # quantization range divided by the calibrated maximum value
        return self._var_max_range[name] / reduce_max(
            self._var_max_value_map[name])

    for op_pos in self._conv_op_index[1:]:
        source_op = self.program.current_block().ops[op_pos]
        weights_name = source_op.input('Filter')[0]
        input_name = source_op.input('Input')[0]
        output_name = source_op.output('Output')[0]

        target_op = self._output_program.current_block().ops[op_pos]
        target_op._set_attr("Scale_weights",
                            self._weights_scaling_factor[weights_name])
        target_op._set_attr("Scale_in", activation_scale(input_name))
        target_op._set_attr("Scale_out", activation_scale(output_name))

        residual_inputs = target_op.desc.input("ResidualData")
        if residual_inputs:
            target_op._set_attr("Scale_in_eltwise",
                                activation_scale(residual_inputs[0]))
def __sampling(self, sampling_data):
'''
Derive scales and value ranges from the sampled tensors.

For weight variables a per-channel scaling factor is computed from the
first sample; for all other sampling variables the quantization range
and the absolute sample values are stored.
Args:
sampling_data: dict mapping a variable name to its list of samples.
'''
for i in self.program.list_vars():
if i.name not in self.sampling_vars:
continue
if i.name in self._weights_var_name:
scaling_factor_per_channel = []
# Only the first sample of a weight variable is used.
data = sampling_data[i.name][0]
# One factor per slice along the first axis: s8_max / max(|slice|),
# or 0.0 when that maximum is (close to) zero.
for j in range(data.shape[0]):
var_value = float(np.max(np.abs(data[j])))
if not self._is_close(var_value, 0.0):
scaling_factor_per_channel.append(Calibrator.s8_max /
var_value)
else:
scaling_factor_per_channel.append(0.0)
self._weights_scaling_factor[
i.name] = scaling_factor_per_channel
else:
if i.name in self._conv_output_var_name:
# Conv outputs: the range follows the fuse_relu attribute of the
# producing op, and the variable is recorded as u8 or s8 output.
op_pos = self.__get_op_index_by_output_var(self.program,
i.name)
cur_op = self.program.current_block().ops[op_pos]
if cur_op.has_attr('fuse_relu') and cur_op.attr(
'fuse_relu'):
max_range = Calibrator.u8_max
self._u8_output_var.append(i.name)
else:
max_range = Calibrator.s8_max
self._s8_output_var.append(i.name)
else:
# Other variables: the range is found by tracing the producers.
max_range = self.__get_max_range_by_var_name(self.program,
i.name)
# Absolute values of every sample, each wrapped in its own list.
max_value = [[np.abs(np_data)]
for np_data in sampling_data[i.name]]
self._var_max_range[i.name] = max_range
self._var_max_value_map[i.name] = max_value
def __check_force_fp32_attr_by_output_var(self, program, var_name):
    '''
    Return the force_fp32_output attribute of the first conv2d op that
    outputs var_name, or False when no conv2d op produces it.
    '''
    for op in program.current_block().ops:
        if op.type != "conv2d":
            continue
        if var_name not in op.output_arg_names:
            continue
        return op.attr("force_fp32_output")
    return False
def __get_op_index_by_output_var(self, program, var_name, start_index=0):
    '''
    Find the first conv2d/pool2d op, at or after start_index, that
    outputs the given variable.

    Args:
        program: the program whose current block is searched.
        var_name: name of the variable to look for.
        start_index: non-negative position at which the search starts.
    Returns:
        The index of that op in program.current_block().ops, or -1 when
        the variable is not the output of any conv2d/pool2d op in the
        searched range.
    '''
    remaining_ops = program.current_block().ops[start_index:]
    # Count from start_index so the result is a position in the full op
    # list, not an offset into the slice.
    for index, op in enumerate(remaining_ops, start_index):
        if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type:
            return index
    return -1
def __get_op_index_by_input_var(self, program, var_name, start_index=0):
    '''
    Find the first op, at or after start_index, that reads the given
    variable.

    Args:
        program: the program whose current block is searched.
        var_name: name of the variable to look for.
        start_index: non-negative position at which the search starts.
    Returns:
        The index of that op in program.current_block().ops, or -1 when
        no op in the searched range takes the variable as input.
    '''
    remaining_ops = program.current_block().ops[start_index:]
    # Count from start_index so the result is a position in the full op
    # list, not an offset into the slice.
    for index, op in enumerate(remaining_ops, start_index):
        if var_name in op.input_arg_names:
            return index
    return -1
def __get_quantize_dequantize_combination(self, program):
"""
Get the quantize/dequantize op index for further inserting.

Besides collecting the positions, this method sets the
`force_fp32_output` attribute on some conv2d ops of `program`.
Args:
The program desc.
Returns:
Two lists contains the quantize op and dequantize op index information.
"""
quantize_op_index = []
dequantize_op_index = []
minimal_conv_count = 2 # there must be two conv ops if not enable the first conv int8.
if len(self._conv_op_index) < minimal_conv_count:
return [], []
for index, value in enumerate(self._conv_op_index):
if index == 0:
# First conv2d: a quantize position is recorded at the second conv2d op.
quantize_op_index.append(self._conv_op_index[index + 1])
elif index == len(self._conv_op_index) - 1:
# Last conv2d: dequantize after it when all consumers of its output are
# int8-enabled, otherwise make the conv itself produce fp32.
output_var = program.current_block().ops[value].output(
"Output")[0]
# NOTE(review): `index` (position in the conv list) is passed as
# start_index here, not the op position `value` - confirm intended.
if self.__check_op_type_with_specified_var_as_input(
program, output_var, index):
dequantize_op_index.append(self._conv_op_index[index] + 2)
else:
program.current_block().ops[value]._set_attr(
"force_fp32_output", True)
elif self._conv_op_index[index] + 1 < self._conv_op_index[index +
1]:
# At least one other op sits between this conv2d and the next one.
program.current_block().ops[self._conv_op_index[
index]]._set_attr("force_fp32_output", True)
# Walk backwards from the next conv2d down to the op after this one.
# NOTE(review): the nesting of this loop is not visible in this view
# (indentation lost) - confirm against the original file.
for op_index in range(self._conv_op_index[index + 1],
self._conv_op_index[index], -1):
op_type = program.current_block().ops[op_index].type
op_has_int8_input = False
input_var_name = None
input_length = len(program.current_block().ops[op_index]
.input_arg_names)
# Does the op read a variable recorded as an int8 output?
for var_name in program.current_block().ops[
op_index].input_arg_names:
if self.__check_var_source_dt(var_name) != -1:
op_has_int8_input = True
input_var_name = var_name
break
if op_has_int8_input:
if op_type == "conv2d":
# The decision depends on the type of the op that follows this conv2d.
if program.current_block().ops[op_index +
1].type == "conv2d":
continue
elif program.current_block(
).ops[op_index +
1].type in Calibrator.non_conv_int8_op_type:
dequantize_op_index.append(op_index + 2)
break
else:
program.current_block().ops[op_index]._set_attr(
"force_fp32_output", True)
continue
elif not self.__check_force_fp32_attr_by_output_var(
program, input_var_name
) and op_index not in dequantize_op_index:
# Record a dequantize position unless an op sharing one of the inputs
# already has one recorded.
share_input_flag = True
for input_attr_name in program.current_block().ops[
op_index].input_names:
input_var_name = program.current_block().ops[
op_index].input(input_attr_name)[0]
cousin_op_index = self.__get_op_index_by_input_var(
program, input_var_name)
if cousin_op_index != -1 and cousin_op_index in dequantize_op_index:
share_input_flag = False
break
if share_input_flag:
dequantize_op_index.append(op_index)
elif input_length:
# No int8 input: record a quantize position when all inputs are read
# only by int8-enabled ops, or when an output of this op is found by
# the conv2d/pool2d output search starting at this op.
output_is_to_int8_op = False
share_input_flag = True
for var_name in program.current_block().ops[
op_index].input_arg_names:
if not self.__check_op_type_with_specified_var_as_input(
program, var_name):
share_input_flag = False
break
for var_name in program.current_block().ops[
op_index].output_arg_names:
if self.__get_op_index_by_output_var(
program, var_name, op_index) != -1:
output_is_to_int8_op = True
break
if share_input_flag or output_is_to_int8_op:
quantize_op_index.append(op_index)
return quantize_op_index, dequantize_op_index
def __init_analysis(self):
    '''
    Collect the names of the variables to sample, looking at every conv2d
    op except the first one.
    '''
    first_analysed = 1  # the analysis starts at the second conv op
    for op_pos in self._conv_op_index[first_analysed:]:
        conv_op = self.program.current_block().ops[op_pos]
        self._weights_var_name.append(conv_op.input('Filter')[0])
        self._conv_input_var_name.append(conv_op.input('Input')[0])
        output_name = conv_op.output('Output')[0]
        self._conv_output_var_name.append(output_name)
        # Remember which op produces this int8 output.
        self._int8_output_var_op_index_dict[output_name] = op_pos

        residual_inputs = conv_op.desc.input("ResidualData")
        if residual_inputs:
            self._residual_input_var_name.append(residual_inputs[0])

        # A pool2d placed right after the conv is sampled as well.
        following_op = self.program.current_block().ops[op_pos + 1]
        if following_op.type == "pool2d":
            self._pool2d_output_var_name.append(following_op.output('Out')[0])
def __expand_quantized_bins(self, quantized_bins, reference_bins):
expanded_quantized_bins = [0] * len(reference_bins)
num_merged_bins = len(reference_bins) / len(quantized_bins)
j_start = 0
j_end = num_merged_bins
for idx in xrange(len(quantized_bins)):
zero_count = reference_bins[j_start:j_end].count(0)
num_merged_bins = j_end - j_start
if zero_count == num_merged_bins:
avg_bin_ele = 0
else:
avg_bin_ele = quantized_bins[idx] / (
num_merged_bins - zero_count + 0.0)
for idx1 in xrange(j_start, j_end):
expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0
else avg_bin_ele)
j_start += num_merged_bins
j_end += num_merged_bins
if (idx + 1) == len(quantized_bins) - 1:
j_end = len(reference_bins)
return expanded_quantized_bins
def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q,
Q_sum):
'''
Calculate the entropy.
'''
assert len(reference_distr_P) == len(candidate_distr_Q)
tmp_sum1 = 0
tmp_sum2 = 0
for idx in range(len(reference_distr_P)):
p_idx = reference_distr_P[idx]
q_idx = candidate_distr_Q[idx]
if p_idx == 0:
tmp_sum1 += 0
tmp_sum2 += 0
else:
if q_idx == 0:
print("Fatal error!, idx = " + str(idx) +
" qindex = 0! p_idx = " + str(p_idx))
tmp_sum1 += p_idx * (math.log(Q_sum * p_idx))
tmp_sum2 += p_idx * (math.log(P_sum * q_idx))
return (tmp_sum1 - tmp_sum2) / P_sum
# Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
def __get_optimal_scaling_factor(self,
activation_blob,
num_quantized_bins=255):
'''
Using the KL-divergenc method to get the more precise scaling factor.
'''
max_val = np.max(activation_blob)
min_val = np.min(activation_blob)
if min_val >= 0:
hist, hist_edeges = np.histogram(
activation_blob, bins=2048, range=(min_val, max_val))
ending_iter = 2047
starting_iter = int(ending_iter * 0.7)
else:
th = max(abs(max_val), abs(min_val))
hist, hist_edeges = np.histogram(
activation_blob, bins=2048, range=(-th, th))
starting_iter = 0
ending_iter = 2047
if abs(max_val) > abs(min_val):
while starting_iter < ending_iter:
if hist[starting_iter] == 0:
starting_iter += 1
continue
else:
break
starting_iter += int((ending_iter - starting_iter) * 0.6)
else:
while ending_iter > 0:
if hist[ending_iter] == 0:
ending_iter -= 1
continue
else:
break
starting_iter = int(0.6 * ending_iter)
bin_width = hist_edeges[1] - hist_edeges[0]
P_sum = len(np.array(activation_blob).ravel())
min_kl_divergence = 0
min_kl_index = 0
kl_inited = False
for i in range(starting_iter, ending_iter + 1):
reference_distr_P = hist[0:i].tolist()
outliers_count = sum(hist[i:2048])
if reference_distr_P[i - 1] == 0:
continue
reference_distr_P[i - 1] += outliers_count
reference_distr_bins = reference_distr_P[:]
candidate_distr_Q = hist[0:i].tolist()
num_merged_bins = i / num_quantized_bins
candidate_distr_Q_quantized = [0] * num_quantized_bins
j_start = 0
j_end = num_merged_bins
for idx in xrange(num_quantized_bins):
candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[
j_start:j_end])
j_start += num_merged_bins
j_end += num_merged_bins
if (idx + 1) == num_quantized_bins - 1:
j_end = i
candidate_distr_Q = self.__expand_quantized_bins(
candidate_distr_Q_quantized, reference_distr_bins)
Q_sum = sum(candidate_distr_Q)
kl_divergence = self.__safe_entropy(reference_distr_P, P_sum,
candidate_distr_Q, Q_sum)
if not kl_inited:
min_kl_divergence = kl_divergence
min_kl_index = i
kl_inited = True
elif kl_divergence < min_kl_divergence:
min_kl_divergence = kl_divergence
min_kl_index = i
else:
pass
if min_kl_index == 0:
while starting_iter > 0:
if hist[starting_iter] == 0:
starting_iter -= 1
continue
else:
break
min_kl_index = starting_iter
return (min_kl_index + 0.5) * bin_width
@staticmethod
def __dot(program, output_name="model.dot"):
    '''
    Write a graphviz dot file describing the ops and variables of the
    program, for debugging.
    '''
    edges = []
    nodes = []

    def remember(collection, entry):
        # keep the first occurrence only, preserving insertion order
        if entry not in collection:
            collection.append(entry)

    for block in program.blocks:
        for index, op in enumerate(list(block.ops)):
            op_type = op.type
            op_label = op_type + "_" + op.output_arg_names[0].replace(
                ".", "_") + "___" + str(index)

            for raw_name in op.input_arg_names:
                var_label = raw_name.replace(".", "_")
                remember(edges, var_label + " -> " + op_label)
                remember(nodes, var_label +
                         " [shape=oval, style=filled, fillcolor=yellow]")

            for raw_name in op.output_arg_names:
                var_label = raw_name.replace(".", "_")
                remember(edges, op_label + " -> " + var_label)

            # The colour of the op node tells how the op is executed.
            if op_type in Calibrator.supported_int8_op_type:
                forced_fp32 = op_type == "conv2d" and op.has_attr(
                    'force_fp32_output') and op.attr("force_fp32_output")
                if forced_fp32:
                    op_node = op_label + " [shape=box, style=filled, color=deeppink]"
                else:
                    op_node = op_label + " [shape=box, style=filled, color=greenyellow]"
            elif op_type in ["quantize", "dequantize"]:
                op_node = op_label + " [shape=box, style=filled, color=gold]"
            else:
                op_node = op_label + " [shape=box, style=filled, fillcolor=red]"
            remember(nodes, op_node)

    pieces = ["digraph pm {\n"]
    pieces.extend(edge + "\n" for edge in edges)
    pieces.extend(node + "\n" for node in nodes)
    pieces.append("}")
    with open(output_name, 'w') as f:
        f.write("".join(pieces))
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if(APPLE OR WIN32 OR NOT WITH_MKL)
list(REMOVE_ITEM TEST_OPS test_calibration_resnet50)
list(REMOVE_ITEM TEST_OPS test_calibration_mobilenetv1)
endif()
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
if(src MATCHES "test_calibration_*")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI})
else()
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
endif()
endforeach() endforeach()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import sys
from test_calibration_resnet50 import TestCalibration
class TestCalibrationForMobilenetv1(TestCalibration):
    """INT8 calibration accuracy test for the MobileNet-V1 inference model.

    Runs FP32 inference, INT8 calibration and INT8 inference through the
    ``run_program`` / ``download_data`` helpers inherited from
    ``TestCalibration`` and checks that top-1 accuracy drops by less than
    0.01 after quantization.
    """

    def download_model(self):
        """Download the MobileNet-V1 model archive and record test settings.

        Sets ``self.model_cache_folder`` to the folder returned by
        ``download_data``, ``self.model`` to the display name used in log
        messages and ``self.algo`` to the calibration algorithm ("KL").
        """
        # mobilenetv1 fp32 data
        data_urls = [
            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
        ]
        data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
        self.model_cache_folder = self.download_data(data_urls, data_md5s,
                                                     "mobilenetv1_fp32")
        self.model = "MobileNet-V1"
        self.algo = "KL"

    def test_calibration(self):
        """Compare FP32 and calibrated INT8 accuracy for the model.

        The performance report is printed before the accuracy assertion so
        that the measured numbers are still visible when the assertion fails.
        """
        self.download_model()
        fp32_model_dir = self.model_cache_folder + "/model"

        print("Start FP32 inference for {0} on {1} images ...".format(
            self.model, self.infer_iterations * self.batch_size))
        (fp32_throughput, fp32_latency,
         fp32_acc1) = self.run_program(fp32_model_dir)

        print("Start INT8 calibration for {0} on {1} images ...".format(
            self.model, self.sample_iterations * self.batch_size))
        # The positional True asks run_program to calibrate and save an INT8
        # model; that call records the output directory in self.int8_model.
        self.run_program(fp32_model_dir, True, algo=self.algo)

        print("Start INT8 inference for {0} on {1} images ...".format(
            self.model, self.infer_iterations * self.batch_size))
        (int8_throughput, int8_latency,
         int8_acc1) = self.run_program(self.int8_model)

        print(
            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
            format(self.model, self.batch_size, fp32_throughput, fp32_latency,
                   fp32_acc1))
        print(
            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
            format(self.model, self.batch_size, int8_throughput, int8_latency,
                   int8_acc1))
        sys.stdout.flush()

        delta_value = fp32_acc1 - int8_acc1
        self.assertLess(
            delta_value, 0.01,
            "INT8 accuracy {0} is 0.01 or more below FP32 accuracy {1}".format(
                int8_acc1, fp32_acc1))
# Run the tests in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import numpy as np
import time
import sys
import random
import paddle
import paddle.fluid as fluid
import functools
import contextlib
from paddle.dataset.common import download
from PIL import Image, ImageEnhance
import math
import paddle.fluid.contrib.int8_inference.utility as int8_utility
# Fix both random number generators so shuffling/cropping is reproducible.
random.seed(0)
np.random.seed(0)
# Side length (pixels) of the square crop produced by process_image.
DATA_DIM = 224
# Thread count and buffer size handed to paddle.reader.xmap_readers.
THREAD = 1
BUF_SIZE = 102400
# Default data folder, relative to the working directory.
DATA_DIR = 'data/ILSVRC2012'
# Per-channel mean / std used to normalize images; shaped (3, 1, 1) so they
# broadcast over channel-first image arrays.
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
# TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator
def resize_short(img, target_size):
    """Resize ``img`` so its shorter side becomes ``target_size``.

    Both sides are scaled by the same factor (rounded to whole pixels) and
    the image is resampled with the LANCZOS filter.

    Args:
        img: image object exposing ``size`` as (width, height) and ``resize``.
        target_size: desired length of the shorter side.

    Returns:
        The resized image returned by ``img.resize``.
    """
    scale = float(target_size) / min(img.size[0], img.size[1])
    new_size = (int(round(img.size[0] * scale)),
                int(round(img.size[1] * scale)))
    return img.resize(new_size, Image.LANCZOS)
def crop_image(img, target_size, center):
    """Crop a ``target_size`` x ``target_size`` window out of ``img``.

    Args:
        img: image object exposing ``size`` as (width, height) and ``crop``.
        target_size: side length of the square crop.
        center: when truthy the window is centered; otherwise its top-left
            corner is drawn uniformly at random with ``np.random.randint``.

    Returns:
        Whatever ``img.crop`` returns for the computed
        (left, upper, right, lower) box.
    """
    width, height = img.size
    size = target_size
    if center:
        # Floor division keeps the offsets integral on Python 3, where "/"
        # would yield fractional coordinates for odd margins.
        w_start = (width - size) // 2
        h_start = (height - size) // 2
    else:
        w_start = np.random.randint(0, width - size + 1)
        h_start = np.random.randint(0, height - size + 1)
    box = (w_start, h_start, w_start + size, h_start + size)
    return img.crop(box)
def process_image(sample, mode, color_jitter, rotate):
    """Load one image and turn it into a normalized channel-first array.

    The image at ``sample[0]`` is resized (short side 256), center-cropped to
    ``DATA_DIM``, converted to RGB if needed, scaled to [0, 1], and then
    normalized with ``img_mean`` / ``img_std``.

    Args:
        sample: sequence whose first item is the image path and whose second
            item is passed through unchanged as the label.
        mode, color_jitter, rotate: accepted but not used by this function.

    Returns:
        Tuple ``(image_array, sample[1])``.
    """
    picture = Image.open(sample[0])
    picture = resize_short(picture, target_size=256)
    picture = crop_image(picture, target_size=DATA_DIM, center=True)
    if picture.mode != 'RGB':
        picture = picture.convert('RGB')

    # HWC -> CHW, then scale pixel values down by 255.
    pixels = np.array(picture).astype('float32')
    pixels = pixels.transpose((2, 0, 1)) / 255
    # In-place normalization against the module-level mean and std.
    pixels -= img_mean
    pixels /= img_std
    return pixels, sample[1]
def _reader_creator(file_list,
                    mode,
                    shuffle=False,
                    color_jitter=False,
                    rotate=False,
                    data_dir=DATA_DIR):
    """Build a reader that yields preprocessed images listed in ``file_list``.

    Args:
        file_list: path of a text file with one "<image path> <label>" entry
            per line.
        mode, color_jitter, rotate: forwarded to ``process_image``.
        shuffle: shuffle the entries with ``np.random.shuffle`` before reading.
        data_dir: folder that the listed image paths are joined onto.

    Returns:
        The reader produced by ``paddle.reader.xmap_readers``, which applies
        ``process_image`` to every (path, label) pair.
    """

    def reader():
        with open(file_list) as flist:
            entries = [raw.strip() for raw in flist]
            if shuffle:
                np.random.shuffle(entries)
            for entry in entries:
                rel_path, label = entry.split()
                full_path = os.path.join(data_dir, rel_path)
                # Entries whose image file is missing are silently skipped.
                if os.path.exists(full_path):
                    yield full_path, int(label)

    mapper = functools.partial(
        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def val(data_dir=DATA_DIR):
    """Return an unshuffled reader over ``<data_dir>/val_list.txt``."""
    return _reader_creator(
        os.path.join(data_dir, 'val_list.txt'),
        'val',
        shuffle=False,
        data_dir=data_dir)
class TestCalibration(unittest.TestCase):
    """Shared fixture and helpers for the INT8 calibration tests.

    Subclasses override ``download_model`` and add a test method; this base
    class downloads the validation data, links it to ``./data`` and offers
    ``run_program`` for FP32 inference, INT8 calibration and INT8 inference.
    The ``DATASET`` environment variable selects the full dataset ("full")
    or a small sample (any other value).
    """

    def setUp(self):
        """Download the validation data and initialise the test parameters."""
        self.int8_download = 'int8/download'
        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                               self.int8_download)
        use_full_dataset = os.environ.get('DATASET') == 'full'

        self.data_cache_folder = ''
        if use_full_dataset:
            data_urls = [
                'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa',
                'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab',
            ]
            data_md5s = [
                '60f6525b0e1d127f345641d75d41f0a8',
                '1e9f15f64e015e58d6f9ec3210ed18b5',
            ]
            self.data_cache_folder = self.download_data(data_urls, data_md5s,
                                                        "full_data", False)
        else:
            data_urls = [
                'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
            ]
            data_md5s = ['1b6c1c434172cca1bf9ba1e4d7a3157d']
            self.data_cache_folder = self.download_data(data_urls, data_md5s,
                                                        "small_data", False)

        # reader/decorator.py requires the relative path to the data folder
        cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
                                                   self.data_cache_folder)
        os.system(cmd)

        self.batch_size = 1 if use_full_dataset else 50
        self.sample_iterations = 50 if use_full_dataset else 1
        self.infer_iterations = 50000 if use_full_dataset else 1

        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
        self.int8_model = ''

    def tearDown(self):
        """Delete the generated INT8 model directory, if one was created."""
        import shutil

        # Nothing to clean up when calibration never ran or the directory is
        # already gone.
        if not self.int8_model or not os.path.exists(self.int8_model):
            return
        try:
            # shutil raises on failure (os.system never did), so the handler
            # below can actually report problems.
            shutil.rmtree(self.int8_model)
        except OSError as e:
            print("Failed to delete {} due to {}".format(self.int8_model,
                                                         str(e)))

    def cache_unzipping(self, target_folder, zip_path):
        """Extract ``zip_path`` into ``target_folder`` unless it already exists."""
        if os.path.exists(target_folder):
            return
        cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
        os.system(cmd)

    def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
        """Download an archive and unpack it below the cache folder.

        Args:
            data_urls: archive URLs. For the full dataset these are the parts
                of one split archive; otherwise only the first URL is used.
            data_md5s: MD5 checksums matching ``data_urls``.
            folder_name: name of the extraction folder inside
                ``self.cache_folder``.
            is_model: True when the archive is a model rather than the
                validation dataset.

        Returns:
            Path of the folder the archive was extracted into.
        """
        data_cache_folder = os.path.join(self.cache_folder, folder_name)

        if os.environ.get('DATASET') == 'full' and not is_model:
            # Only the dataset is shipped as a split archive. Model downloads
            # must not take this path, otherwise a model archive could be
            # concatenated into full_imagenet_val.tar.gz.
            # presumably `download` stores files under self.cache_folder;
            # verify against paddle.dataset.common.download
            part_paths = []
            for url, md5 in zip(data_urls, data_md5s):
                download(url, self.int8_download, md5)
                part_paths.append(
                    os.path.join(self.cache_folder, url.split('/')[-1]))

            zip_path = os.path.join(self.cache_folder,
                                    'full_imagenet_val.tar.gz')
            if not os.path.exists(zip_path):
                cat_command = 'cat ' + ' '.join(part_paths) + ' > ' + zip_path
                os.system(cat_command)
        else:
            download(data_urls[0], self.int8_download, data_md5s[0])
            file_name = data_urls[0].split('/')[-1]
            zip_path = os.path.join(self.cache_folder, file_name)

        print('Data is downloaded at {0}'.format(zip_path))
        self.cache_unzipping(data_cache_folder, zip_path)
        return data_cache_folder

    def download_model(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def run_program(self, model_path, generate_int8=False, algo='direct'):
        """Run inference, or INT8 calibration, on the validation reader.

        Args:
            model_path: directory of the inference model to load.
            generate_int8: when True, sample calibration data for
                ``self.sample_iterations`` batches and save an INT8 model
                into ``self.int8_model``; when False, run
                ``self.infer_iterations`` batches of plain inference.
            algo: calibration algorithm name passed to the Calibrator.

        Returns:
            ``(throughput, latency, acc1)`` for plain inference, ``None``
            when ``generate_int8`` is True.
        """
        image_shape = [3, 224, 224]

        fluid.memory_optimize(fluid.default_main_program())

        exe = fluid.Executor(fluid.CPUPlace())

        [infer_program, feed_dict,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)

        t = fluid.transpiler.InferenceTranspiler()
        t.transpile(infer_program, fluid.CPUPlace())

        val_reader = paddle.batch(val(), self.batch_size)
        iterations = self.infer_iterations

        if generate_int8:
            self.int8_model = os.path.join(os.getcwd(),
                                           "calibration_out_" + self.timestamp)
            iterations = self.sample_iterations
            try:
                # os.makedirs raises on failure, unlike a shelled-out mkdir
                # whose exit status was never inspected.
                if not os.path.isdir(self.int8_model):
                    os.makedirs(self.int8_model)
            except OSError as e:
                print("Failed to create {} due to {}".format(self.int8_model,
                                                             str(e)))
                sys.exit(-1)

            calibrator = int8_utility.Calibrator(
                program=infer_program,
                pretrained_model=model_path,
                algo=algo,
                exe=exe,
                output=self.int8_model,
                feed_var_names=feed_dict,
                fetch_list=fetch_targets)

        test_info = []
        cnt = 0
        periods = []
        for batch_id, data in enumerate(val_reader()):
            image = np.array(
                [x[0].reshape(image_shape) for x in data]).astype("float32")
            label = np.array([x[1] for x in data]).astype("int64")
            label = label.reshape([-1, 1])
            if generate_int8:
                running_program = calibrator.sampling_program.clone()
            else:
                running_program = infer_program.clone()

            # Only the executor call itself is timed.
            t1 = time.time()
            _, acc1, _ = exe.run(
                running_program,
                feed={feed_dict[0]: image,
                      feed_dict[1]: label},
                fetch_list=fetch_targets)
            t2 = time.time()
            periods.append(t2 - t1)

            if generate_int8:
                calibrator.sample_data()

            # Weight the batch accuracy by batch size so the final average
            # is per image.
            test_info.append(np.mean(acc1) * len(data))
            cnt += len(data)

            if (batch_id + 1) % 100 == 0:
                print("{0} images,".format(batch_id + 1))
                sys.stdout.flush()

            if (batch_id + 1) == iterations:
                break

        if generate_int8:
            calibrator.save_int8_model()

            # Report the directory that was actually written to.
            print(
                "Calibration is done and the corresponding files are generated at {}".
                format(os.path.abspath(self.int8_model)))
        else:
            throughput = cnt / np.sum(periods)
            latency = np.average(periods)
            acc1 = np.sum(test_info) / cnt
            return (throughput, latency, acc1)
class TestCalibrationForResnet50(TestCalibration):
    """INT8 calibration accuracy test for the ResNet-50 inference model.

    Runs FP32 inference, INT8 calibration and INT8 inference through the
    helpers of ``TestCalibration`` and checks that top-1 accuracy drops by
    less than 0.01 after quantization.
    """

    def download_model(self):
        """Download the ResNet-50 model archive and record test settings.

        Sets ``self.model_cache_folder`` to the folder returned by
        ``download_data``, ``self.model`` to the display name used in log
        messages and ``self.algo`` to the calibration algorithm ("direct").
        """
        # resnet50 fp32 data
        data_urls = [
            'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
        ]
        data_md5s = ['4a5194524823d9b76da6e738e1367881']
        self.model_cache_folder = self.download_data(data_urls, data_md5s,
                                                     "resnet50_fp32")
        self.model = "ResNet-50"
        self.algo = "direct"

    def test_calibration(self):
        """Compare FP32 and calibrated INT8 accuracy for the model.

        The performance report is printed before the accuracy assertion so
        that the measured numbers are still visible when the assertion fails.
        """
        self.download_model()
        fp32_model_dir = self.model_cache_folder + "/model"

        print("Start FP32 inference for {0} on {1} images ...".format(
            self.model, self.infer_iterations * self.batch_size))
        (fp32_throughput, fp32_latency,
         fp32_acc1) = self.run_program(fp32_model_dir)

        print("Start INT8 calibration for {0} on {1} images ...".format(
            self.model, self.sample_iterations * self.batch_size))
        # generate_int8=True (second positional argument): calibrate and save
        # the INT8 model; run_program records its location in self.int8_model.
        self.run_program(fp32_model_dir, True, algo=self.algo)

        print("Start INT8 inference for {0} on {1} images ...".format(
            self.model, self.infer_iterations * self.batch_size))
        (int8_throughput, int8_latency,
         int8_acc1) = self.run_program(self.int8_model)

        print(
            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
            format(self.model, self.batch_size, fp32_throughput, fp32_latency,
                   fp32_acc1))
        print(
            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
            format(self.model, self.batch_size, int8_throughput, int8_latency,
                   int8_acc1))
        sys.stdout.flush()

        delta_value = fp32_acc1 - int8_acc1
        self.assertLess(
            delta_value, 0.01,
            "INT8 accuracy {0} is 0.01 or more below FP32 accuracy {1}".format(
                int8_acc1, fp32_acc1))
# Run the tests in this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -110,7 +110,6 @@ packages=['paddle', ...@@ -110,7 +110,6 @@ packages=['paddle',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize', 'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.int8_inference',
'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.reader',
'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim',
'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.core',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册