remove unused inference_transpiler unit-tests (#19130)

* remove unused inference_transpiler unit-tests test=develop * remove InferenceTranspiler usage in quantize_transpiler.py test=develop

remove unused inference_transpiler unit-tests (#19130)
* remove unused inference_transpiler unit-tests test=develop * remove InferenceTranspiler usage in quantize_transpiler.py test=develop
2f8c7e02 · Tao Luo · GitHub · 708bd979 · 2f8c7e02 · 2f8c7e02
6 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -472,7 +472,7 @@ paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None,
 paddle.fluid.contrib.QuantizeTranspiler ('paddle.fluid.contrib.quantize.quantize_transpiler.QuantizeTranspiler', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
 paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5'))
 paddle.fluid.contrib.Compressor ('paddle.fluid.contrib.slim.core.compressor.Compressor', ('document', 'a5417774a94aa9ae5560a42b96527e7d'))

--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
@@ -25,7 +25,6 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.layers.nn import autoincreased_step_counter
 from paddle.fluid.framework import Variable
 from paddle.fluid.executor import global_scope
-from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
 __all__ = ['QuantizeTranspiler']
@@ -221,7 +220,7 @@ class QuantizeTranspiler(object):
            self.activation_quantize_type == 'range_abs_max':
            self.global_step = autoincreased_step_counter()
-    def freeze_program(self, program, place, fuse_bn=False, scope=None):
+    def freeze_program(self, program, place, scope=None):
        """Freeze input training program for inference.
        Args:
@@ -232,10 +231,6 @@ class QuantizeTranspiler(object):
        scope = global_scope() if scope is None else scope
        program = default_main_program() if program is None else program
-        if fuse_bn:
-            bn_fuse_transpiler = BNFuseTranspiler()
-            bn_fuse_transpiler.transpile(program, place)
        persistable_vars = [
            v.name
            for v in filter(lambda var: var.persistable, program.list_vars())
@@ -564,58 +559,3 @@ class QuantizeTranspiler(object):
                    'Scale': scale},
            outputs={"Out": dequant_var})
        return dequant_var
-class BNFuseTranspiler(InferenceTranspiler):
-    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
-        def _update_param(op, param_name, new_param):
-            var = self.block.vars[param_name]
-            tensor = self.scope.find_var(param_name).get_tensor()
-            tensor.set(np.array(new_param), self.place)
-        def _load_param(param_name):
-            return np.array(self.scope.find_var(param_name).get_tensor())
-        bias_bn = _load_param(bn_op.input("Bias")[0])  #Bias
-        scale_bn = _load_param(bn_op.input("Scale")[0])  #Scale
-        mean_bn = _load_param(bn_op.input("Mean")[0])  #Mean
-        var_bn = _load_param(bn_op.input("Variance")[0])  #Variance
-        if current_op.type in ['conv2d', 'depthwise_conv2d']:
-            current_param = _load_param(
-                _original_var_name(current_op.input("Filter")[0]))
-        elif current_op.type == 'mul':
-            current_param = _load_param(
-                _original_var_name(current_op.input("Y")[0]))
-        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
-        tmp = np.float32(np.divide(scale_bn, std_bn))
-        # add bias of batch_norm_op to conv2d
-        if with_bias:
-            bias = _load_param(bias_op.input("Y"))
-        else:
-            bias = np.zeros(bias_bn.shape)
-        bias = np.float32(
-            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
-        # re-compute weight of conv2d/fc
-        tmp = tmp.reshape(tmp.shape[0], -1)
-        dst_param = current_param.reshape((tmp.shape[0], -1))
-        dst_param = np.float32(np.multiply(dst_param, tmp))
-        dst_param = dst_param.reshape(current_param.shape)
-        # update parameters
-        if current_op.type in ['conv2d', 'depthwise_conv2d']:
-            _update_param(current_op,
-                          _original_var_name(current_op.input("Filter")[0]),
-                          dst_param)
-        elif current_op.type == 'mul':
-            _update_param(current_op,
-                          _original_var_name(current_op.input("Y")[0]),
-                          dst_param)
-        _update_param(bias_op, bias_op.input("Y")[0], bias)
-        # collect the renamed input
-        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -242,31 +242,16 @@ def infer(use_cuda, save_dirname=None):
        batch_size = 1
        tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
-        # Use inference_transpiler to speedup
-        inference_transpiler_program = inference_program.clone()
-        t = fluid.transpiler.InferenceTranspiler()
-        t.transpile(inference_transpiler_program, place)
        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
        # and results will contain a list of data corresponding to fetch_targets.
        results = exe.run(inference_program,
                          feed={feed_target_names[0]: tensor_img},
                          fetch_list=fetch_targets)
-        transpiler_results = exe.run(inference_transpiler_program,
-                                     feed={feed_target_names[0]: tensor_img},
-                                     fetch_list=fetch_targets)
-        assert len(results[0]) == len(transpiler_results[0])
-        for i in range(len(results[0])):
-            np.testing.assert_almost_equal(
-                results[0][i], transpiler_results[0][i], decimal=4)
        print("infer results: ", results[0])
        fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe,
+                                      fetch_targets, exe, inference_program)
-                                      inference_transpiler_program)
 def main(net_type, use_cuda, is_local=True):

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -221,31 +221,16 @@ def infer(use_cuda, save_dirname=None):
        batch_size = 1
        tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
-        # Use inference_transpiler to speedup
-        inference_transpiler_program = inference_program.clone()
-        t = fluid.transpiler.InferenceTranspiler()
-        t.transpile(inference_transpiler_program, place)
        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
        # and results will contain a list of data corresponding to fetch_targets.
        results = exe.run(inference_program,
                          feed={feed_target_names[0]: tensor_img},
                          fetch_list=fetch_targets)
-        transpiler_results = exe.run(inference_transpiler_program,
-                                     feed={feed_target_names[0]: tensor_img},
-                                     fetch_list=fetch_targets)
-        assert len(results[0]) == len(transpiler_results[0])
-        for i in range(len(results[0])):
-            np.testing.assert_almost_equal(
-                results[0][i], transpiler_results[0][i], decimal=4)
        print("infer results: ", results[0])
        fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe,
+                                      fetch_targets, exe, inference_program)
-                                      inference_transpiler_program)
 def main(net_type, use_cuda, is_local=True):

--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -15,7 +15,6 @@
 from __future__ import print_function
 from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
-from .inference_transpiler import InferenceTranspiler
 from .memory_optimization_transpiler import memory_optimize, release_memory
 from .ps_dispatcher import HashName, RoundRobin

--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import os
-import sys
-import numpy as np
-from .. import core
-from ..framework import Program
-from ..executor import global_scope
-class InferenceTranspiler(object):
-    '''
-    Convert the fluid program to optimized inference program.
-    There are several optimizations:
-      - fuse convolution and batch normalization
-      - fuse batch normalization and relu (MKLDNN only)
-    Examples:
-    .. code-block:: python
-        # As InferenceTranspiler will modify the original program,
-        # please clone before use it.
-        inference_transpiler_program = program.clone()
-        t = fluid.InferenceTranspiler()
-        t.transpile(inference_transpiler_program, place)
-    '''
-    def transpile(self, program, place, scope=None):
-        '''
-        Run the transpiler.
-        Args:
-            program (Program): program to transpile
-            place (Place): inference place
-            scope (Scope|None): inference Scope
-        '''
-        sys.stderr.write("InferenceTranspiler is deprecated since it's not "
-                         "safe. Users should be "
-                         "responsible for constructing the inference program\n")
-        if not isinstance(program, Program):
-            raise TypeError("program should be as Program type")
-        if not isinstance(place, core.CPUPlace) and not isinstance(
-                place, core.CUDAPlace):
-            raise TypeError("place should be as CPUPlace/CUDAPlace type")
-        if scope is None:
-            scope = global_scope()
-        if not isinstance(scope, core._Scope):
-            raise TypeError("scope should be as Scope type or None")
-        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
-        if use_mkldnn:
-            self._depthwise_conv_mkldnn(program)
-        self._fuse_batch_norm(program, place, scope)
-        if use_mkldnn:
-            self._fuse_conv_bias_mkldnn(program)
-            self._fuse_conv_relu_mkldnn(program)
-            self._fuse_conv_eltwise_mkldnn(program)
-            self._fuse_conv_relu_mkldnn(
-                program)  # ResNet residual block merging
-            self._fuse_bn_relu_mkldnn(program)
-            self._fuse_mul_add_mkldnn(program)
-        self._is_test_pass(program)
-    def _is_test_pass(self, program):
-        '''
-        Transpile the program setting is_test = true for all layers and
-        inserts is_test attribute to pooling and activation layers.
-        As a result some operators might run faster
-        :param program: program to transpile
-        :type program: Program
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops):
-            current_op = self.block.ops[i]
-            if current_op.has_attr("is_test"):
-                current_op._set_attr("is_test", True)
-            elif current_op.type in [
-                    "pool2d", "sigmoid", "logsigmoid", "softshrink", "exp",
-                    "brelu", "pow", "leaky_relu", "stanh", "relu", "tanh",
-                    "tanh_shrink", "sqrt", "abs", "ceil", "elu", "floor", "cos",
-                    "sin", "round", "reciprocal", "hard_shrink", "hard_sigmoid",
-                    "relu6", "soft_relu", "swish", "thresholded_relu", "log",
-                    "square", "softplus", "softsign"
-            ]:
-                current_op._set_attr("is_test", True)
-            i = i + 1
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _depthwise_conv_mkldnn(self, program):
-        '''
-        Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
-        The result is:
-            - before:
-                - any_other_op->depthwise_conv->any_other_op
-            - after:
-                - any_other_op->conv->any_other_op
-        :param program: program to transpile
-        :type program: Program
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops):
-            current_op = self.block.ops[i]
-            if current_op.type == 'depthwise_conv2d':
-                current_op.desc.set_type("conv2d")
-            i = i + 1
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_conv_eltwise_mkldnn(self, program):
-        '''
-        Transpile the program fusing elementwise_add into conv for MKLDNN
-        program. Elementwise add following convolution OP can be fused by adding
-        'fuse_residual_connection' attribute to convolution OP and replacing its output
-        Tensor with second parameter of elementwise_add.
-        The result of fuse is:
-            - before:
-                - conv->elementwise_add->any_other_op
-            - after:
-                - conv->any_other_op
-        :param program: program to transpile
-        :type program: Program
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops):
-            current_op = self.block.ops[i]
-            if current_op.type in ['conv2d']:
-                next_op = self.block.ops[i + 1]
-                if next_op.type == 'elementwise_add':
-                    self._fuse_conv_eltwise(i, current_op, next_op)
-                    self.block._remove_op(i + 1)  # Remove old conv
-                    self.block._remove_op(i + 1)  # Remove elementwise_add
-            i = i + 1
-        self._adjust_input()
-        self._remove_unused_var()
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_conv_relu_mkldnn(self, program):
-        '''
-        Transpile the program by fused relu activation for MKLDNN program.
-        Relu activation following convolution OP can be fused by adding
-        'fuse_relu' attribute to convolution OP.
-        The result of fuse is:
-            - before:
-                - conv->relu->any_other_op
-            - after:
-                - conv->any_other_op
-        :param program: program to transpile
-        :type program: Program
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops):
-            current_op = self.block.ops[i]
-            if current_op.type in ['conv2d']:
-                next_op = self.block.ops[i + 1]
-                if next_op.type == 'relu':
-                    # modify bnorm OP to include relu
-                    current_op._set_attr("fuse_relu", True)
-                    # remove relu OP
-                    self.block._remove_op(i + 1)
-            i = i + 1
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_bn_relu_mkldnn(self, program):
-        '''
-        Transpile the program by fused relu activation for MKLDNN program.
-        Relu activation following batch norm OP can be fused by adding
-        :math:`fuse_with_relu` attribute to batch norm OP.
-        The result of fuse is:
-        - before:
-          - batch_norm->relu->any_other_op
-        - after:
-          - batch_norm->any_other_op
-        :param program: program to transpile
-        :type program: Program
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops) - 1:
-            current_op = self.block.ops[i]
-            if current_op.type in ['batch_norm']:
-                next_op = self.block.ops[i + 1]
-                if next_op.type == 'relu':
-                    # modify bnorm OP to include relu
-                    current_op._set_attr("fuse_with_relu", True)
-                    # remove relu OP
-                    self.block._remove_op(i + 1)
-            i = i + 1
-        self._remove_unused_var()
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_conv_bias_mkldnn(self, program):
-        '''
-        Transpile the program by fused convolution and elementwise_add.
-        Replace conv2d and elementwise_add ops with a new conv2d op
-        based on an old conv2d op and the :math:`Bias` taken from
-        elementwise_add.
-        For input :math:`X`:
-        - Conv process:            :math:`X = input * W`
-        - Elementwise_add process: :math` X = X + bias`
-        After fuse into one operation:
-        .. math::
-            X = input * W + bias
-        The operator transformation is:
-        - before:
-          - conv->elementwise_add->any_other_op
-        - after:
-          - conv->any_other_op
-        The transpile stages are:
-        1. Extract bias and output variables from elementwise_add.
-        2. Extract Input, Weight and attributes from conv op.
-        3. Create a new convolution op based on extracted params.
-        4. Remove old conv op.
-        5. Remove elementwise_add.
-        5. Remove unused variables.
-        Args:
-            program (Program): program to transpile
-        '''
-        self.block = program.block(0)
-        i = 0
-        while i < len(self.block.ops) - 2:
-            current_op = self.block.ops[i]
-            next_op = self.block.ops[i + 1]
-            # conv2d with bias
-            if current_op.type in ['conv2d'] and \
-               next_op.type in ['elementwise_add']:
-                self._fuse_conv_bias(i, current_op, next_op)
-                self.block._remove_op(i + 1)  # Remove old conv
-                self.block._remove_op(i + 1)  # Remove elementwise_add
-            i = i + 1
-        self._remove_unused_var()
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_batch_norm(self, program, place, scope):
-        '''
-        Transpile the program by fused batch normalization.
-        The batch normalization followed the convolution or fully connected layer
-        can be integrated with them. Doing so will give us a forward acceleration,
-        especially in environments like mobile or embedded.
-        For input :math:`X`:
-        - Conv process:        :math:`X = input * W + bias`
-        - Batch norm process:  :math:`X' = (X - mean) / std`
-        - Scale Process:       :math:`Y = a * X' + b`
-        After fuse into one operation:
-        .. math::
-            Y &= (input * W + bias - mean) / std * a + b \\\\
-              &= input * a * W / std + ((bias - mean) / std * a + b)
-        The operator transformation is:
-        - before:
-          - conv->batch_norm->any_other_op (bias == 0)
-          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
-        - after:
-          - conv->elementwise_add->any_other_op
-        The transpile stages are:
-        1. insert elementwise_add op when bias == 0.
-        2. fuse the batch_norm's parameters to conv and elementwise_add operators.
-        3. remove batch_norm ops which are not used in any other ops.
-        4. adjust the input of any_other_op to be the output of elementwise_add operator.
-        5. remove unused variables.
-        Args:
-            program (Program): program to transpile
-            place (Place): inference place
-            scope (Scope): inference Scope
-        '''
-        self.scope = scope
-        self.place = place
-        self.block = program.block(0)
-        self.input_map = {}  # store the input names should be adjusted
-        i = 0
-        while i < len(self.block.ops) - 2:
-            current_op = self.block.ops[i]
-            # TODO(luotao1): consider only conv2d now. fc would be delt later.
-            if current_op.type in ['conv2d']:
-                # TODO(luotao1): consider single chain network now.
-                # For branch network, we counldn't use block.ops[i + 1] as
-                # the judgment condition.
-                next_op = self.block.ops[i + 1]
-                # conv2d without bias
-                if (next_op.type == 'batch_norm'):
-                    # insert bias op
-                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
-                    # fuse batch_norm
-                    self._fuse_param(current_op, next_op, bias_op, 0)
-                    # remove batch_norm_op
-                    self.block._remove_op(i + 2)
-                    i = i + 1
-                # conv2d with bias, the next_op.type is elementwise_add
-                elif (next_op.type == 'elementwise_add'):
-                    next_next_op = self.block.ops[i + 2]
-                    if (next_next_op.type == 'batch_norm'):
-                        # fuse batch_norm
-                        self._fuse_param(current_op, next_next_op, next_op, 1)
-                        # remove batch_norm_op
-                        self.block._remove_op(i + 2)
-                        i = i + 1
-            i = i + 1
-        self._adjust_input()
-        self._remove_unused_var()
-        # TODO(luotao): use clone() method to flush the program.desc in force,
-        # since some large program.desc will not be flushed immediately.
-        # And a better solution will be considered later.
-        program = program.clone()
-    def _fuse_mul_add_mkldnn(self, program):
-        '''
-        Transpile the program by fusing Mul+Add layers to FC layer with the MKL-DNN inner product.
-        The MUL following a Elementwise_add layer can be replaced by the MKL-DNN FC.
-        The Elementwise add's bias input 'Y' has to be added into the
-        MKL-DNN-based FC input 'Bias'.
-         The operator transformation is:
-         - before:
-           - MUL->elementwise_add -> any_other_op
-         - after:
-           - FC -> any_other_op
-         The transpile stages are:
-         1. insert a new MKL-DNN-based FC operator with `Bias` input
-            taken from the Elementwise add's input 'Y' (bias),
-        2. fuse the parameters of MUL and Elemenwise add,
-        3. remove the MUL, elementwise_add operators,
-        4. make the input of the deleted Elementwise add operator to be the input of the
-           new FC operator,
-        5. remove unused variables,
-         Args:
-            program (Program): program to transpile
-         '''
-        self.block = program.block(0)
-        self.input_map = {}  # store the input names should be adjusted
-        i = 0
-        while i < len(self.block.ops):
-            # find a elementwise add op
-            if self.block.ops[i].type == 'elementwise_add':
-                add_op = self.block.ops[i]
-                add_idx = i
-                mul_idx = -1
-                # find the preceding mul op
-                for j in reversed(range(add_idx)):
-                    if self.block.ops[j].type == 'mul':
-                        mul_out_name = self.block.ops[j].output_arg_names[0]
-                        if self.block.ops[j].output_arg_names[
-                                0] in add_op.input_arg_names:
-                            mul_op = self.block.ops[j]
-                            mul_idx = j
-                            break
-                if mul_idx < 0:
-                    i += 1
-                    continue
-                # create and insert a new fc op
-                fc_op_new = self._insert_fc_op(add_idx + 1, mul_op, add_op)
-                # remove the old operators
-                self.block._remove_op(add_idx)
-                self.block._remove_op(mul_idx)
-                # restart scanning for elementwise add from the deleted mul's index
-                i = mul_idx
-            i += 1
-        self._adjust_input()
-        self._remove_unused_var()
-        program = program.clone()
-    # ====================== private transpiler functions =====================
-    def _insert_bias_op(self, index, current_op, bn_op):
-        '''
-        Construct elementwise_add operator for adding bias
-        and insert it into program.
-        :param index: insert location of bias_op
-        :type index: Int
-        :param current_op: current operator (conv or fc)
-        :type current_op: Operator
-        :param bn_op: batch norm operator
-        :type bn_op: Operator
-        :return: bias_op
-        :rtype: Operator
-        '''
-        # The input of bias_op is current_op's output and Bias of bn_op
-        # The output of bias_op is bn_op's output
-        x_var = self.block.var(current_op.output("Output")[0])
-        y_var = self.block.var(bn_op.input("Bias")[0])
-        out_var = self.block.var(bn_op.output("Y")[0])
-        bias_op = self.block._insert_op(
-            index,
-            type="elementwise_add",
-            inputs={"X": x_var,
-                    "Y": y_var},
-            outputs={"Out": out_var},
-            attrs={"axis": 1})  # dim_start=1
-        return bias_op
-    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
-        '''
-        fuse the batch_norm_op' parameters to current_op (conv or fc)
-        :param current_op: current operator (conv or fc)
-        :type current_op: Operator
-        :param bn_op: batch norm operator
-        :type bn_op: Operator
-        :param bias_op: elementwise_add operator for adding bias
-        :type bias_op: Operator
-        :param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
-        :type with_bias: Int
-        '''
-        def _update_param(op, old_param_name, new_param):
-            # For the sake of remaining the original variables the same as before,
-            # create new variables in scope to store the new parameters.
-            old_param_name = old_param_name[0]
-            old_var = self.block.vars[old_param_name]
-            new_param_name = old_param_name + '_fuse_bn'
-            new_var = self.block.create_parameter(
-                name=new_param_name.encode('ascii'),
-                type=old_var.type,
-                dtype=old_var.dtype,
-                shape=old_var.shape)
-            op._rename_input(old_param_name, new_param_name)
-            self.scope.var(new_param_name)
-            tensor = self.scope.find_var(new_param_name).get_tensor()
-            tensor.set(np.array(new_param), self.place)
-        def _load_param(param_name):
-            return np.array(self.scope.find_var(param_name[0]).get_tensor())
-        bias_bn = _load_param(bn_op.input("Bias"))  #Bias
-        scale_bn = _load_param(bn_op.input("Scale"))  #Scale
-        mean_bn = _load_param(bn_op.input("Mean"))  #Mean
-        var_bn = _load_param(bn_op.input("Variance"))  #Variance
-        # TODO(luotao1): consider only conv2d now. fc would be delt later.
-        current_param = _load_param(current_op.input("Filter"))
-        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
-        tmp = np.float32(np.divide(scale_bn, std_bn))
-        # add bias of batch_norm_op to conv2d
-        if with_bias:
-            bias = _load_param(bias_op.input("Y"))
-        else:
-            bias = np.zeros(bias_bn.shape)
-        bias = np.float32(
-            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
-        # re-compute weight of conv2d
-        tmp = tmp.reshape(tmp.shape[0], -1)
-        dst_param = current_param.reshape((tmp.shape[0], -1))
-        dst_param = np.float32(np.multiply(dst_param, tmp))
-        dst_param = dst_param.reshape(current_param.shape)
-        # update parameters
-        _update_param(current_op, current_op.input("Filter"), dst_param)
-        _update_param(bias_op, bias_op.input("Y"), bias)
-        # collect the renamed input
-        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
-    def _fuse_conv_bias(self, index, conv_op, elementwise_add_op):
-        '''
-        fuse the conv op with elementwise_add
-        :param index: index of the conv_op in ops list
-        :type index: Int
-        :param conv_op: convolution operator
-        :type conv_op: Operator
-        :param elementwise_add_op: convolution's bias operator
-        :type elementwise_add_op: Operator
-        '''
-        bias_var = self.block.var(elementwise_add_op.input("Y")[0])
-        out_var = self.block.var(elementwise_add_op.output("Out")[0])
-        filter_var = self.block.var(conv_op.input("Filter")[0])
-        in_var = self.block.var(conv_op.input("Input")[0])
-        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
-        self.block._insert_op(
-            index,
-            type="conv2d",
-            inputs={"Input": in_var,
-                    "Filter": filter_var,
-                    "Bias": bias_var},
-            outputs={"Output": out_var},
-            attrs=attrs)
-    def _insert_fc_op(self, index, mul_op, add_op):
-        '''
-        Construct a new FC operator by copying the old Mul and adding the
-        'Y' input taken from the Elementwise add's input 'Y'.
-        :param index: insert location of FC
-        :type  index: Int
-        :param mul_op: MUL operator to be copied
-        :type  mul_op: Operator
-        :param add_op: Elementwise add operator taken bias from
-        :type  add_op: Operator
-        :return: fc_op_new
-        :type:   Operator
-        '''
-        def get_op_outputs(op, names):
-            result = {}
-            for name in names:
-                result[name] = self.block.var(op.output(name)[0])
-            return result
-        fc_inputs = {}
-        fc_inputs['Input'] = self.block.var(mul_op.input('X')[0])
-        fc_inputs['W'] = self.block.var(mul_op.input('Y')[0])
-        fc_inputs['Bias'] = self.block.var(add_op.input('Y')[0])
-        fc_outputs = get_op_outputs(add_op, ['Out'])
-        fc_attrs = {}
-        fc_attrs['use_mkldnn'] = True
-        fc_op_new = self.block._insert_op(
-            index,
-            type='fc',
-            inputs=fc_inputs,
-            outputs=fc_outputs,
-            attrs=fc_attrs)
-        return fc_op_new
-    def _fuse_conv_eltwise(self, index, conv_op, eltwise_op):
-        '''
-        fuse the conv op with elementwise_add
-        :param conv_op: convolution operator
-        :type conv_op: Operator
-        :param eltwise_op: operator adding data from skip connection
-        :type eltwise_op: Operator
-        '''
-        eltwise_input = "X"
-        if eltwise_op.input("X")[0] == conv_op.output("Output")[0]:
-            eltwise_input = "Y"
-        residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]]
-        out_var = self.block.vars[eltwise_op.output("Out")[0]]
-        filter_var = self.block.vars[conv_op.input("Filter")[0]]
-        in_var = self.block.vars[conv_op.input("Input")[0]]
-        bias_var = self.block.vars[conv_op.input("Bias")[0]]
-        conv_op._set_attr("fuse_residual_connection", True)
-        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
-        self.block._insert_op(
-            index,
-            type="conv2d",
-            inputs={
-                "Input": in_var,
-                "Filter": filter_var,
-                "Bias": bias_var,
-                "ResidualData": residual_var
-            },
-            outputs={"Output": out_var},
-            attrs=attrs)
-    def _adjust_input(self):
-        for i in range(len(self.block.ops)):
-            current_op = self.block.ops[i]
-            for input_arg in current_op.input_arg_names:
-                if input_arg in self.input_map:
-                    current_op._rename_input(input_arg,
-                                             self.input_map[input_arg])
-    def _remove_unused_var(self):
-        '''
-        remove unused varibles in program
-        '''
-        args = []
-        for i in range(len(self.block.ops)):
-            current_op = self.block.ops[i]
-            args += current_op.input_arg_names
-            args += current_op.output_arg_names
-        args = list(set(args))  # unique the input and output arguments
-        for var in list(self.block.vars.keys()):
-            if var not in args:
-                self.block._remove_var(var)