Commit 19da59ed authored by 翟飞跃, committed by Tao Luo

Remove all the code, API and doc of MKL-DNN INT8v1 (#18347)

Parent: 8ed33bf9
@@ -403,9 +403,6 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5'))
paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0'))
......
@@ -22,8 +22,6 @@ from . import op_frequence
from .op_frequence import *
from . import quantize
from .quantize import *
from . import int8_inference
from .int8_inference import *
from . import reader
from .reader import *
from . import slim
@@ -44,7 +42,6 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
__all__ += quantize.__all__
__all__ += int8_inference.__all__
__all__ += reader.__all__
__all__ += slim.__all__
__all__ += utils.__all__
......
# Offline INT8 Calibration Tool
PaddlePaddle supports offline INT8 calibration to accelerate inference. In this document, we provide instructions on how to enable INT8 calibration and report the accuracy results of ResNet-50 and MobileNet-V1.
## 0. Prerequisite
You need to install at least the PaddlePaddle 1.3 Python package: `pip install paddlepaddle==1.3`.
## 1. How to generate INT8 model
You can refer to the unit test in [test_calibration_resnet50.py](../tests/test_calibration_resnet50.py). Basically, there are three steps:
* Construct calibration object.
```python
calibrator = int8_utility.Calibrator( # Step 1
program=infer_program, # required, FP32 program
pretrained_model=model_path, # required, FP32 pretrained model
algo=algo, # required, calibration algorithm; default is max, the alternative is KL (Kullback–Leibler divergence)
exe=exe, # required, executor
output=int8_model, # required, INT8 model
feed_var_names=feed_dict, # required, feed dict
fetch_list=fetch_targets) # required, fetch targets
```
* Call `calibrator.sample_data()` after each executor run.
```python
_, acc1, _ = exe.run(
program,
feed={feed_dict[0]: image,
feed_dict[1]: label},
fetch_list=fetch_targets)
calibrator.sample_data() # Step 2
```
* Call `calibrator.save_int8_model()` after sampling over the specified number of iterations (e.g., iterations = 50); a consolidated sketch of all three steps is shown after this list.
```python
calibrator.save_int8_model() # Step 3
```
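Putting the three steps together, a minimal calibration loop looks like the following sketch (illustrative names; it assumes `infer_program`, `feed_dict` and `fetch_targets` were returned by `fluid.io.load_inference_model`, and that `val_reader` yields preprocessed `(image, label)` batches as in the unit test):
```python
calibrator = int8_utility.Calibrator(
    program=infer_program,
    pretrained_model=model_path,
    algo="KL",  # or "max"
    exe=exe,
    output=int8_model_path,
    feed_var_names=feed_dict,
    fetch_list=fetch_targets)

for batch_id, (image, label) in enumerate(val_reader()):
    # Run the sampling program so the intermediate tensors stay readable.
    _, acc1, _ = exe.run(calibrator.sampling_program,
                         feed={feed_dict[0]: image,
                               feed_dict[1]: label},
                         fetch_list=fetch_targets)
    calibrator.sample_data()  # Step 2
    if batch_id + 1 == 50:  # sample over e.g. 50 iterations
        break

calibrator.save_int8_model()  # Step 3
```
Note that the loop runs `calibrator.sampling_program`, a clone of the FP32 program in which the sampled variables are made persistable, so that `sample_data()` can read the intermediate tensors back from the global scope.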
## 2. How to run INT8 model
You can load the INT8 model with the load_inference_model [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference similarly to [FP32](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/eval.py "FP32").
```python
[infer_program, feed_dict,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
```
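The loaded INT8 program is then executed in the same way as the FP32 one. A minimal sketch, reusing `infer_program`, `feed_dict` and `fetch_targets` from the snippet above and assuming `image` and `label` are numpy arrays prepared as in the unit test:
```python
import numpy as np

_, acc1, _ = exe.run(infer_program,
                     feed={feed_dict[0]: image,
                           feed_dict[1]: label},
                     fetch_list=fetch_targets)
print("Top-1 accuracy of this batch: {}".format(np.mean(acc1)))
```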
## 3. Result
We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 (single core).
**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.23% | 0.40% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.47% | 0.31% |
**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 11.54 images/s | 32.2 images/s | 2.79 |
| MobileNet-V1 | Full ImageNet Val | 49.21 images/s | 108.37 images/s | 2.2 |
Please note that the [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset") is downloaded automatically by the calibration test scripts (e.g., `test_calibration_resnet50.py`) when run with `DATASET=full`.
Notes:
* The accuracy measurement requires a model with a `label` input.
* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascade Lake servers (see `The theoretical peak compute gains are 4x int8 OPS over fp32 OPS.` in the [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). Therefore, the op-level gain is 4X, while the topology-level gain is smaller.
## 4. How to reproduce the results
* Small dataset for ResNet-50 (Single core)
```bash
FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
>Note: Change `test_calibration_resnet50.py` to `test_calibration_mobilenetv1.py` for MobileNet-V1. The same applies to the commands below.
* Full dataset for ResNet-50 (Single core)
```bash
FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
* Full dataset for ResNet-50 (Multi-core)
```bash
FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
```
> Note: This example command uses 20 cores by setting the `OMP_NUM_THREADS` value.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import utility
from .utility import *
__all__ = utility.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core
import numpy as np
import math
import os
from paddle.fluid.executor import global_scope
from paddle.fluid import io
__all__ = ['Calibrator']
class Calibrator(object):
'''
The Calibrator class transforms the program and writes the calculated
scales into it.
This is the INT8 v1 calibration tool, mainly supporting ResNet-50 and
MobileNet.
'''
# TODO(guomingz): Below op list will be updated once more INT8 op kernels are supported.
non_conv_int8_op_type = ("pool2d", )  # note the trailing comma: a one-element tuple, not a string
supported_int8_op_type = ("conv2d", "pool2d")
const_sign_op_type = ('pool2d', 'reshape', 'concat', 'transpose')
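# Tensors produced by Relu (including conv ops with fused relu) are
# non-negative and are quantized to unsigned int8 (max 255); all other
# tensors are quantized to signed int8 (max 127).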
u8_max = 255
s8_max = 127
def __init__(self, *args, **kwargs):
self.program = kwargs['program']
self.pretrained_model = kwargs['pretrained_model']
self.debug = kwargs['debug'] if 'debug' in kwargs else False
self.algo = kwargs['algo']
self.output = kwargs['output']
self.feed_var_names = kwargs['feed_var_names']
self.fetch_list = kwargs['fetch_list']
self.exe = kwargs['exe']
self._conv_input_var_name = []
self._conv_output_var_name = []
self._pool2d_output_var_name = []
self._weights_var_name = []
self._residual_input_var_name = []
self._int8_output_var_op_index_dict = {}
self._conv_op_index = [
index for index, value in enumerate(self.program.global_block().ops)
if value.type == 'conv2d'
]
self._var_max_value_map = {}
self._var_max_range = {}
self._weights_scaling_factor = {}
self._u8_output_var = []
self._s8_output_var = []
self._persistable_vars = []
self._sampling_data = {}
self.__init_analysis()
self.__generate_output_program()
def save_int8_model(self):
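'''
Compute the scales from the sampled data, insert the quantize/dequantize
ops into the program, and save the resulting INT8 model to disk.
'''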
self.__sampling(self._sampling_data)
self.__save_scale()
self.__update_program()
self.__update_output_program_attr()
self.__display_debug()
self.__save_offline_model()
def sample_data(self):
'''
Sampling the tensor data of variable.
'''
for i in self.sampling_program.list_vars():
if i.name in self.sampling_vars:
np_data = np.array(global_scope().find_var(i.name).get_tensor())
if i.name not in self._sampling_data:
self._sampling_data[i.name] = []
self._sampling_data[i.name].append(np_data)
def __save_offline_model(self):
'''
Save the quantized model to the disk.
'''
io.save_inference_model(self.output, self.feed_var_names,
self.fetch_list, self.exe,
self.sampling_program)
def __display_debug(self):
if self.debug:
self.__dot(self._output_program)
print(self._output_program)
def __get_max_range_by_var_name(self, program, var_name):
"""
Check the specified variable was generated from Relu layer or not.
If the variable was the output of one of the pool2d/reshape/concat
/transpose, we keep trace the ancestor of this variable;
If the variable was the output the conv op, we check it's has_relu
attr;
Otherwise, we return the Calibrator.s8 as default value.
Returns:
Return Calibrator.u8_max if the variable was generated by Relu,
otherwise it will returns Calibrator.s8
"""
search_end_index = -1
input_index_name = {}
output_index_name = {}
ops_type = []
for index, op in enumerate(program.current_block().ops):
ops_type.append(op.type)
input_index_name[index] = op.input_arg_names
output_index_name[index] = op.output_arg_names
if var_name in op.output_arg_names:
search_end_index = index
# analysis
while search_end_index >= 0:
if ops_type[search_end_index] == "relu":
return Calibrator.u8_max
input_name = input_index_name[search_end_index][0]
for i in output_index_name.keys():
if input_name in output_index_name[i]:
search_end_index = i
break
if ops_type[
search_end_index] not in Calibrator.const_sign_op_type and ops_type[
search_end_index] != 'conv2d':
return Calibrator.s8_max
if ops_type[search_end_index] != 'conv2d':
continue
if program.current_block().ops[search_end_index].has_attr(
'fuse_relu') and program.current_block().ops[
search_end_index].attr('fuse_relu'):
return Calibrator.u8_max
else:
return Calibrator.s8_max
return Calibrator.s8_max
def __check_op_type_with_specified_var_as_input(self,
program,
var_name,
start_index=0):
'''
Check the types of all ops that take the specified variable as
input. If any of those ops is not int8-enabled, return False.
'''
op_type_list = [
op.type for op in program.current_block().ops[start_index:]
if var_name in op.input_arg_names
]
for i in op_type_list:
if i not in Calibrator.supported_int8_op_type:
return False
return True
def __check_var_source_dt(self, var_name):
'''
Check whether the specified variable is the output of an int8 conv op.
If so, return that op's index.
If not, return -1.
'''
return self._int8_output_var_op_index_dict[
var_name] if var_name in self._int8_output_var_op_index_dict else -1
def __update_int8_output_var_op_index_dict(self, index, var_name=None):
'''
Update the int8_output_variable/op_index dictionary
'''
for k, v in self._int8_output_var_op_index_dict.items():
if v >= index:
self._int8_output_var_op_index_dict[k] = v + 1
if var_name:
self._int8_output_var_op_index_dict[var_name] = index
def __update_program(self):
'''
Update the program by inserting the quantize/dequantize ops.
'''
quantize_index, dequantize_index = self.__get_quantize_dequantize_combination(
self._output_program)
inserted_op_length = 0
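# Each scale is max_range / max_value, where max_value is either the
# plain maximum of the sampled data or the KL-optimal clipping threshold.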
calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max
insert_op_collection = sorted(quantize_index + dequantize_index)
for index in insert_op_collection:
if index in quantize_index:
quantize_tmp = self._output_program.current_block().create_var(
name="quantize_{}_tmp".format(index),
dtype=core.VarDesc.VarType.UINT8)
original_out_name = self._output_program.current_block().ops[
index + inserted_op_length - 1].output_names[0]
original_out = self._output_program.current_block().ops[
index + inserted_op_length - 1].output(original_out_name)[0]
op = self._output_program.current_block()._insert_op(
index=index + inserted_op_length,
type="quantize",
inputs={"Input": original_out},
outputs={"Output": quantize_tmp}, )
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
op._set_attr(
"Scale", self._var_max_range[original_out] /
calc_max_func(self._var_max_value_map[original_out]))
if self.__get_max_range_by_var_name(
self._output_program,
original_out) == Calibrator.s8_max:
op._set_attr("is_negative_input", 1)
self.__update_int8_output_var_op_index_dict(
index + inserted_op_length, "quantize_{}_tmp".format(index))
inserted_op_length += 1
for op in self._output_program.current_block().ops[
index + inserted_op_length:]:
for j in op.input_names:
if op.input(j) and op.input(
j
)[0] == original_out and op.type in Calibrator.supported_int8_op_type:
op.desc.set_input(j,
["{}".format(quantize_tmp.name)])
else:
start_index = index + inserted_op_length
dequantize_tmp_var = self._output_program.current_block(
).create_var(
name="dequantize_{}_tmp".format(index + 1),
dtype="float32", )
original_out_var = None
for original_input in self._output_program.current_block().ops[
start_index].input_arg_names:
index_res = self.__get_op_index_by_output_var(
self._output_program, original_input)
if index_res != -1:
original_out_var = original_input
break
if original_out_var:
op = self._output_program.current_block()._insert_op(
index=start_index,
type="dequantize",
inputs={"Input": original_out_var},
outputs={"Output": dequantize_tmp_var})
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
op._set_attr("Scale", self._var_max_range[original_out_var]
/ calc_max_func(self._var_max_value_map[
original_out_var]))
for op_index in range(
start_index + 1,
len(self._output_program.current_block().ops)):
if self._output_program.current_block(
).ops[op_index].type == "conv2d" and self._output_program.current_block(
).ops[op_index].attr("force_fp32_output"):
continue
else:
for j in self._output_program.current_block().ops[
op_index].input_names:
if len(self._output_program.current_block().ops[
op_index].input(j)
) and self._output_program.current_block(
).ops[op_index].input(j)[
0] == original_out_var:
self._output_program.current_block(
).ops[op_index].desc.set_input(
j,
["{}".format(dequantize_tmp_var.name)])
inserted_op_length += 1
op._set_attr("data_format", "MKLDNNLAYOUT")
op._set_attr("use_mkldnn", 1)
def __update_output_program_attr(self):
for i in self._output_program.list_vars():
if i.name in self._persistable_vars:
i.persistable = False
os.system("rm -rf {}/{}".format(self.pretrained_model, i.name))
for i in self._u8_output_var:
self._output_program.current_block().var(i).desc.set_dtype(
core.VarDesc.VarType.UINT8)
for i in self._s8_output_var:
self._output_program.current_block().var(i).desc.set_dtype(
core.VarDesc.VarType.INT8)
@property
def sampling_program(self):
return self._output_program
@property
def sampling_vars(self):
return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name
def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0):
return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
def __generate_output_program(self):
for i in self.program.list_vars():
if not i.persistable and i.name in self.sampling_vars:
i.persistable = True
self._persistable_vars.append(i.name)
self._output_program = self.program.clone()
def __save_scale(self):
'''
Update the convolution scale information.
'''
func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max
for i in self._conv_op_index[1:]:
weights_var_name = self.program.current_block().ops[i].input(
'Filter')[0]
input_var_name = self.program.current_block().ops[i].input('Input')[
0]
output_var_name = self.program.current_block().ops[i].output(
'Output')[0]
self._output_program.current_block().ops[i]._set_attr(
"Scale_weights", self._weights_scaling_factor[weights_var_name])
self._output_program.current_block().ops[i]._set_attr(
"Scale_in", self._var_max_range[input_var_name] /
func(self._var_max_value_map[input_var_name]))
self._output_program.current_block().ops[i]._set_attr(
"Scale_out", self._var_max_range[output_var_name] /
func(self._var_max_value_map[output_var_name]))
if self._output_program.current_block().ops[i].desc.input(
"ResidualData"):
residual_var_name = self._output_program.current_block().ops[
i].desc.input("ResidualData")[0]
self._output_program.current_block().ops[i]._set_attr(
"Scale_in_eltwise", self._var_max_range[residual_var_name] /
func(self._var_max_value_map[residual_var_name]))
def __sampling(self, sampling_data):
'''
Sampling the variables data range.
'''
for i in self.program.list_vars():
if i.name not in self.sampling_vars:
continue
if i.name in self._weights_var_name:
scaling_factor_per_channel = []
data = sampling_data[i.name][0]
for j in range(data.shape[0]):
var_value = float(np.max(np.abs(data[j])))
if not self._is_close(var_value, 0.0):
scaling_factor_per_channel.append(Calibrator.s8_max /
var_value)
else:
scaling_factor_per_channel.append(0.0)
self._weights_scaling_factor[
i.name] = scaling_factor_per_channel
else:
if i.name in self._conv_output_var_name:
op_pos = self.__get_op_index_by_output_var(self.program,
i.name)
cur_op = self.program.current_block().ops[op_pos]
if cur_op.has_attr('fuse_relu') and cur_op.attr(
'fuse_relu'):
max_range = Calibrator.u8_max
self._u8_output_var.append(i.name)
else:
max_range = Calibrator.s8_max
self._s8_output_var.append(i.name)
else:
max_range = self.__get_max_range_by_var_name(self.program,
i.name)
max_value = [[np.abs(np_data)]
for np_data in sampling_data[i.name]]
self._var_max_range[i.name] = max_range
self._var_max_value_map[i.name] = max_value
def __check_force_fp32_attr_by_output_var(self, program, var_name):
for op in program.current_block().ops:
if op.type == "conv2d" and var_name in op.output_arg_names:
return op.attr("force_fp32_output")
return False
def __get_op_index_by_output_var(self, program, var_name, start_index=0):
'''
Check whether the specified variable is the output of a conv/pool2d
op.
Returns:
The op index if the variable is the output of some conv/pool2d op.
-1 when the variable is not the output of any conv/pool2d op.
'''
for index, op in enumerate(program.current_block().ops[start_index:]):
if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type:
return index
return -1
def __get_op_index_by_input_var(self, program, var_name, start_index=0):
'''
Get the index of the first op that takes the specified variable as
input.
Returns:
The op index if the variable is the input of some op, or -1 if the
variable is not the input of any op.
'''
for index, op in enumerate(program.current_block().ops[start_index:]):
if var_name in op.input_arg_names:
return index
return -1
def __get_quantize_dequantize_combination(self, program):
"""
Get the quantize/dequantize op indices for further insertion.
Args:
program: the program desc.
Returns:
Two lists containing the quantize op indices and the dequantize op
indices respectively.
"""
quantize_op_index = []
dequantize_op_index = []
minimal_conv_count = 2  # there must be at least two conv ops when the first conv is not int8-enabled
if len(self._conv_op_index) < minimal_conv_count:
return [], []
for index, value in enumerate(self._conv_op_index):
if index == 0:
quantize_op_index.append(self._conv_op_index[index + 1])
elif index == len(self._conv_op_index) - 1:
output_var = program.current_block().ops[value].output(
"Output")[0]
if self.__check_op_type_with_specified_var_as_input(
program, output_var, index):
dequantize_op_index.append(self._conv_op_index[index] + 2)
else:
program.current_block().ops[value]._set_attr(
"force_fp32_output", True)
elif self._conv_op_index[index] + 1 < self._conv_op_index[index +
1]:
program.current_block().ops[self._conv_op_index[
index]]._set_attr("force_fp32_output", True)
for op_index in range(self._conv_op_index[index + 1],
self._conv_op_index[index], -1):
op_type = program.current_block().ops[op_index].type
op_has_int8_input = False
input_var_name = None
input_length = len(program.current_block().ops[op_index]
.input_arg_names)
for var_name in program.current_block().ops[
op_index].input_arg_names:
if self.__check_var_source_dt(var_name) != -1:
op_has_int8_input = True
input_var_name = var_name
break
if op_has_int8_input:
if op_type == "conv2d":
if program.current_block().ops[op_index +
1].type == "conv2d":
continue
elif program.current_block(
).ops[op_index +
1].type in Calibrator.non_conv_int8_op_type:
dequantize_op_index.append(op_index + 2)
break
else:
program.current_block().ops[op_index]._set_attr(
"force_fp32_output", True)
continue
elif not self.__check_force_fp32_attr_by_output_var(
program, input_var_name
) and op_index not in dequantize_op_index:
share_input_flag = True
for input_attr_name in program.current_block().ops[
op_index].input_names:
input_var_name = program.current_block().ops[
op_index].input(input_attr_name)[0]
cousin_op_index = self.__get_op_index_by_input_var(
program, input_var_name)
if cousin_op_index != -1 and cousin_op_index in dequantize_op_index:
share_input_flag = False
break
if share_input_flag:
dequantize_op_index.append(op_index)
elif input_length:
output_is_to_int8_op = False
share_input_flag = True
for var_name in program.current_block().ops[
op_index].input_arg_names:
if not self.__check_op_type_with_specified_var_as_input(
program, var_name):
share_input_flag = False
break
for var_name in program.current_block().ops[
op_index].output_arg_names:
if self.__get_op_index_by_output_var(
program, var_name, op_index) != -1:
output_is_to_int8_op = True
break
if share_input_flag or output_is_to_int8_op:
quantize_op_index.append(op_index)
return quantize_op_index, dequantize_op_index
def __init_analysis(self):
'''
Collect the variable names for sampling.
'''
start_index = 1  # analyze the conv op details starting from the second conv op
for i in self._conv_op_index[start_index:]:
self._weights_var_name.append(self.program.current_block().ops[i]
.input('Filter')[0])
self._conv_input_var_name.append(self.program.current_block().ops[i]
.input('Input')[0])
self._conv_output_var_name.append(self.program.current_block().ops[
i].output('Output')[0])
self._int8_output_var_op_index_dict[self.program.current_block()
.ops[i].output('Output')[0]] = i
if self.program.current_block().ops[i].desc.input("ResidualData"):
self._residual_input_var_name.append(self.program.current_block(
).ops[i].desc.input("ResidualData")[0])
if self.program.current_block().ops[i + 1].type == "pool2d":
self._pool2d_output_var_name.append(self.program.current_block(
).ops[i + 1].output('Out')[0])
def __expand_quantized_bins(self, quantized_bins, reference_bins):
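# Expand the quantized histogram back to the reference resolution:
# each quantized bin's mass is spread evenly over the non-zero
# reference bins it covers, while zero reference bins stay zero.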
expanded_quantized_bins = [0] * len(reference_bins)
num_merged_bins = len(reference_bins) // len(quantized_bins)
j_start = 0
j_end = num_merged_bins
for idx in range(len(quantized_bins)):
zero_count = reference_bins[j_start:j_end].count(0)
num_merged_bins = j_end - j_start
if zero_count == num_merged_bins:
avg_bin_ele = 0
else:
avg_bin_ele = quantized_bins[idx] / (
num_merged_bins - zero_count + 0.0)
for idx1 in range(j_start, j_end):
expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0
else avg_bin_ele)
j_start += num_merged_bins
j_end += num_merged_bins
if (idx + 1) == len(quantized_bins) - 1:
j_end = len(reference_bins)
return expanded_quantized_bins
def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q,
Q_sum):
'''
Calculate the KL divergence between the reference distribution P and
the candidate distribution Q.
'''
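# With unnormalized bin counts p and q, the normalized KL divergence is
# KL(P||Q) = sum_i (p_i/P_sum) * log((p_i/P_sum) / (q_i/Q_sum))
#          = (sum_i p_i*log(Q_sum*p_i) - sum_i p_i*log(P_sum*q_i)) / P_sum,
# which is what the two partial sums below compute.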
assert len(reference_distr_P) == len(candidate_distr_Q)
tmp_sum1 = 0
tmp_sum2 = 0
for idx in range(len(reference_distr_P)):
p_idx = reference_distr_P[idx]
q_idx = candidate_distr_Q[idx]
if p_idx == 0:
tmp_sum1 += 0
tmp_sum2 += 0
else:
if q_idx == 0:
print("Fatal error!, idx = " + str(idx) +
" qindex = 0! p_idx = " + str(p_idx))
tmp_sum1 += p_idx * (math.log(Q_sum * p_idx))
tmp_sum2 += p_idx * (math.log(P_sum * q_idx))
return (tmp_sum1 - tmp_sum2) / P_sum
# Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
def __get_optimal_scaling_factor(self,
activation_blob,
num_quantized_bins=255):
'''
Use the KL-divergence method to get a more precise scaling factor.
'''
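# The search below follows the TensorRT-style procedure referenced above:
# 1. build a 2048-bin histogram of the activation blob;
# 2. pick a starting threshold (skipping empty leading/trailing bins);
# 3. for each candidate threshold i, fold the outlier mass into bin i-1,
#    quantize the first i bins down to num_quantized_bins bins, expand
#    them back, and measure the KL divergence against the reference;
# 4. return the center of the bin with the minimal divergence as the
#    optimal absolute-maximum value.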
max_val = np.max(activation_blob)
min_val = np.min(activation_blob)
if min_val >= 0:
hist, hist_edges = np.histogram(
activation_blob, bins=2048, range=(min_val, max_val))
ending_iter = 2047
starting_iter = int(ending_iter * 0.7)
else:
th = max(abs(max_val), abs(min_val))
hist, hist_edges = np.histogram(
activation_blob, bins=2048, range=(-th, th))
starting_iter = 0
ending_iter = 2047
if abs(max_val) > abs(min_val):
while starting_iter < ending_iter:
if hist[starting_iter] == 0:
starting_iter += 1
continue
else:
break
starting_iter += int((ending_iter - starting_iter) * 0.6)
else:
while ending_iter > 0:
if hist[ending_iter] == 0:
ending_iter -= 1
continue
else:
break
starting_iter = int(0.6 * ending_iter)
bin_width = hist_edges[1] - hist_edges[0]
P_sum = len(np.array(activation_blob).ravel())
min_kl_divergence = 0
min_kl_index = 0
kl_inited = False
for i in range(starting_iter, ending_iter + 1):
reference_distr_P = hist[0:i].tolist()
outliers_count = sum(hist[i:2048])
if reference_distr_P[i - 1] == 0:
continue
reference_distr_P[i - 1] += outliers_count
reference_distr_bins = reference_distr_P[:]
candidate_distr_Q = hist[0:i].tolist()
num_merged_bins = i // num_quantized_bins
candidate_distr_Q_quantized = [0] * num_quantized_bins
j_start = 0
j_end = num_merged_bins
for idx in range(num_quantized_bins):
candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[
j_start:j_end])
j_start += num_merged_bins
j_end += num_merged_bins
if (idx + 1) == num_quantized_bins - 1:
j_end = i
candidate_distr_Q = self.__expand_quantized_bins(
candidate_distr_Q_quantized, reference_distr_bins)
Q_sum = sum(candidate_distr_Q)
kl_divergence = self.__safe_entropy(reference_distr_P, P_sum,
candidate_distr_Q, Q_sum)
if not kl_inited:
min_kl_divergence = kl_divergence
min_kl_index = i
kl_inited = True
elif kl_divergence < min_kl_divergence:
min_kl_divergence = kl_divergence
min_kl_index = i
else:
pass
if min_kl_index == 0:
while starting_iter > 0:
if hist[starting_iter] == 0:
starting_iter -= 1
continue
else:
break
min_kl_index = starting_iter
return (min_kl_index + 0.5) * bin_width
@staticmethod
def __dot(program, output_name="model.dot"):
'''
Generate the graphiz dot file for debugging.
'''
dot_graph = ""
dot_nodes = []
dot_edges = []
dot_graph += "digraph pm {\n"
for block in program.blocks:
ops = list(block.ops)
for index, op in enumerate(ops):
op_type = op.type
op_name = op_type + "_" + op.output_arg_names[0].replace(
".", "_") + "___" + str(index)
for name in op.input_arg_names:
name = name.replace(".", "_")
dot_edge = name + " -> " + op_name
if dot_edge not in dot_edges:
dot_edges.append(dot_edge)
dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]"
if dot_node not in dot_nodes:
dot_nodes.append(dot_node)
for name in op.output_arg_names:
name = name.replace(".", "_")
dot_edge = op_name + " -> " + name
if dot_edge not in dot_edges:
dot_edges.append(dot_edge)
if op_type in Calibrator.supported_int8_op_type:
if op_type == "conv2d" and op.has_attr(
'force_fp32_output') and op.attr(
"force_fp32_output"):
dot_node = op_name + " [shape=box, style=filled, color=deeppink]"
else:
dot_node = op_name + " [shape=box, style=filled, color=greenyellow]"
elif op_type in ["quantize", "dequantize"]:
dot_node = op_name + " [shape=box, style=filled, color=gold]"
else:
dot_node = op_name + " [shape=box, style=filled, fillcolor=red]"
if dot_node not in dot_nodes:
dot_nodes.append(dot_node)
for dot_edge in dot_edges:
dot_graph += dot_edge + "\n"
for dot_node in dot_nodes:
dot_graph += dot_node + "\n"
dot_graph += "}"
with open(output_name, 'w') as f:
f.write(dot_graph)
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if(APPLE OR WIN32 OR NOT WITH_MKL)
list(REMOVE_ITEM TEST_OPS test_calibration_resnet50)
list(REMOVE_ITEM TEST_OPS test_calibration_mobilenetv1)
endif()
foreach(src ${TEST_OPS})
if(src MATCHES "^test_calibration_")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI})
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import sys
from test_calibration_resnet50 import TestCalibration
class TestCalibrationForMobilenetv1(TestCalibration):
def download_model(self):
# mobilenetv1 fp32 data
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
"mobilenetv1_fp32")
self.model = "MobileNet-V1"
self.algo = "KL"
def test_calibration(self):
self.download_model()
print("Start FP32 inference for {0} on {1} images ...".format(
self.model, self.infer_iterations * self.batch_size))
(fp32_throughput, fp32_latency,
fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
print("Start INT8 calibration for {0} on {1} images ...".format(
self.model, self.sample_iterations * self.batch_size))
self.run_program(
self.model_cache_folder + "/model", True, algo=self.algo)
print("Start INT8 inference for {0} on {1} images ...".format(
self.model, self.infer_iterations * self.batch_size))
(int8_throughput, int8_latency,
int8_acc1) = self.run_program(self.int8_model)
delta_value = fp32_acc1 - int8_acc1
self.assertLess(delta_value, 0.01)
print(
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, fp32_throughput, fp32_latency,
fp32_acc1))
print(
"INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, int8_throughput, int8_latency,
int8_acc1))
sys.stdout.flush()
if __name__ == '__main__':
unittest.main()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import numpy as np
import time
import sys
import random
import paddle
import paddle.fluid as fluid
import functools
import contextlib
from paddle.dataset.common import download
from PIL import Image, ImageEnhance
import math
import paddle.fluid.contrib.int8_inference.utility as int8_utility
random.seed(0)
np.random.seed(0)
DATA_DIM = 224
THREAD = 1
BUF_SIZE = 102400
DATA_DIR = 'data/ILSVRC2012'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
# TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center:
w_start = (width - size) // 2
h_start = (height - size) // 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
return img, sample[1]
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
data_dir=DATA_DIR):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
np.random.shuffle(full_lines)
lines = full_lines
for line in lines:
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
if not os.path.exists(img_path):
continue
yield img_path, int(label)
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def val(data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
class TestCalibration(unittest.TestCase):
def setUp(self):
self.int8_download = 'int8/download'
self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
self.int8_download)
data_urls = []
data_md5s = []
self.data_cache_folder = ''
if os.environ.get('DATASET') == 'full':
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
)
data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
self.data_cache_folder = self.download_data(data_urls, data_md5s,
"full_data", False)
else:
data_urls.append(
'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
)
data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
self.data_cache_folder = self.download_data(data_urls, data_md5s,
"small_data", False)
# reader/decorator.py requires the relative path to the data folder
cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
self.data_cache_folder)
os.system(cmd)
self.batch_size = 1 if os.environ.get('DATASET') == 'full' else 50
self.sample_iterations = 50 if os.environ.get(
'DATASET') == 'full' else 1
self.infer_iterations = 50000 if os.environ.get(
'DATASET') == 'full' else 1
self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
self.int8_model = ''
def tearDown(self):
try:
os.system("rm -rf {}".format(self.int8_model))
except Exception as e:
print("Failed to delete {} due to {}".format(self.int8_model,
str(e)))
def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
zip_path)
os.system(cmd)
def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
data_cache_folder = os.path.join(self.cache_folder, folder_name)
zip_path = ''
if os.environ.get('DATASET') == 'full':
file_names = []
for i in range(0, len(data_urls)):
download(data_urls[i], self.int8_download, data_md5s[i])
file_names.append(data_urls[i].split('/')[-1])
zip_path = os.path.join(self.cache_folder,
'full_imagenet_val.tar.gz')
if not os.path.exists(zip_path):
cat_command = 'cat'
for file_name in file_names:
cat_command += ' ' + os.path.join(self.cache_folder,
file_name)
cat_command += ' > ' + zip_path
os.system(cat_command)
if os.environ.get('DATASET') != 'full' or is_model:
download(data_urls[0], self.int8_download, data_md5s[0])
file_name = data_urls[0].split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
print('Data is downloaded at {0}'.format(zip_path))
self.cache_unzipping(data_cache_folder, zip_path)
return data_cache_folder
def download_model(self):
pass
def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224]
fluid.memory_optimize(fluid.default_main_program())
exe = fluid.Executor(fluid.CPUPlace())
[infer_program, feed_dict,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
t = fluid.transpiler.InferenceTranspiler()
t.transpile(infer_program, fluid.CPUPlace())
val_reader = paddle.batch(val(), self.batch_size)
iterations = self.infer_iterations
if generate_int8:
self.int8_model = os.path.join(os.getcwd(),
"calibration_out_" + self.timestamp)
iterations = self.sample_iterations
try:
os.system("mkdir " + self.int8_model)
except Exception as e:
print("Failed to create {} due to {}".format(self.int8_model,
str(e)))
sys.exit(-1)
calibrator = int8_utility.Calibrator(
program=infer_program,
pretrained_model=model_path,
algo=algo,
exe=exe,
output=self.int8_model,
feed_var_names=feed_dict,
fetch_list=fetch_targets)
test_info = []
cnt = 0
periods = []
for batch_id, data in enumerate(val_reader()):
image = np.array(
[x[0].reshape(image_shape) for x in data]).astype("float32")
label = np.array([x[1] for x in data]).astype("int64")
label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone()
t1 = time.time()
_, acc1, _ = exe.run(
running_program,
feed={feed_dict[0]: image,
feed_dict[1]: label},
fetch_list=fetch_targets)
t2 = time.time()
period = t2 - t1
periods.append(period)
if generate_int8:
calibrator.sample_data()
test_info.append(np.mean(acc1) * len(data))
cnt += len(data)
if (batch_id + 1) % 100 == 0:
print("{0} images,".format(batch_id + 1))
sys.stdout.flush()
if (batch_id + 1) == iterations:
break
if generate_int8:
calibrator.save_int8_model()
print(
"Calibration is done and the corresponding files are generated at {}".
format(self.int8_model))
else:
throughput = cnt / np.sum(periods)
latency = np.average(periods)
acc1 = np.sum(test_info) / cnt
return (throughput, latency, acc1)
class TestCalibrationForResnet50(TestCalibration):
def download_model(self):
# resnet50 fp32 data
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
data_md5s = ['4a5194524823d9b76da6e738e1367881']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
"resnet50_fp32")
self.model = "ResNet-50"
self.algo = "direct"
def test_calibration(self):
self.download_model()
print("Start FP32 inference for {0} on {1} images ...".format(
self.model, self.infer_iterations * self.batch_size))
(fp32_throughput, fp32_latency,
fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
print("Start INT8 calibration for {0} on {1} images ...".format(
self.model, self.sample_iterations * self.batch_size))
self.run_program(
self.model_cache_folder + "/model", True, algo=self.algo)
print("Start INT8 inference for {0} on {1} images ...".format(
self.model, self.infer_iterations * self.batch_size))
(int8_throughput, int8_latency,
int8_acc1) = self.run_program(self.int8_model)
delta_value = fp32_acc1 - int8_acc1
self.assertLess(delta_value, 0.01)
print(
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, fp32_throughput, fp32_latency,
fp32_acc1))
print(
"INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, int8_throughput, int8_latency,
int8_acc1))
sys.stdout.flush()
if __name__ == '__main__':
unittest.main()
@@ -110,7 +110,6 @@ packages=['paddle',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.int8_inference',
'paddle.fluid.contrib.reader',
'paddle.fluid.contrib.slim',
'paddle.fluid.contrib.slim.core',
......