# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import os import sys import numpy as np from .. import core from ..framework import Program from ..executor import global_scope class InferenceTranspiler(object): ''' Convert the fluid program to optimized inference program. There are several optimizations: - fuse convolution and batch normalization - fuse batch normalization and relu (MKLDNN only) Examples: .. code-block:: python # As InferenceTranspiler will modify the original program, # please clone before use it. inference_transpiler_program = program.clone() t = fluid.InferenceTranspiler() t.transpile(inference_transpiler_program, place) ''' def transpile(self, program, place, scope=None): ''' Run the transpiler. Args: program (Program): program to transpile place (Place): inference place scope (Scope|None): inference Scope ''' sys.stderr.write("InferenceTranspiler is deprecated since it's not " "safe. Users should be " "responsible for constructing the inference program\n") if not isinstance(program, Program): raise TypeError("program should be as Program type") if not isinstance(place, core.CPUPlace) and not isinstance( place, core.CUDAPlace): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) if use_mkldnn: self._depthwise_conv_mkldnn(program) self._fuse_batch_norm(program, place, scope) if use_mkldnn: self._fuse_conv_bias_mkldnn(program) self._fuse_conv_relu_mkldnn(program) self._fuse_conv_eltwise_mkldnn(program) self._fuse_conv_relu_mkldnn( program) # ResNet residual block merging self._fuse_bn_relu_mkldnn(program) self._fuse_mul_add_mkldnn(program) self._is_test_pass(program) def _is_test_pass(self, program): ''' Transpile the program setting is_test = true for all layers and inserts is_test attribute to pooling and activation layers. As a result some operators might run faster :param program: program to transpile :type program: Program ''' self.block = program.block(0) i = 0 while i < len(self.block.ops): current_op = self.block.ops[i] if current_op.has_attr("is_test"): current_op._set_attr("is_test", True) elif current_op.type in [ "pool2d", "sigmoid", "logsigmoid", "softshrink", "exp", "brelu", "pow", "leaky_relu", "stanh", "relu", "tanh", "tanh_shrink", "sqrt", "abs", "ceil", "elu", "floor", "cos", "sin", "round", "reciprocal", "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", "softsign" ]: current_op._set_attr("is_test", True) i = i + 1 # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _depthwise_conv_mkldnn(self, program): ''' Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program. The result is: - before: - any_other_op->depthwise_conv->any_other_op - after: - any_other_op->conv->any_other_op :param program: program to transpile :type program: Program ''' self.block = program.block(0) i = 0 while i < len(self.block.ops): current_op = self.block.ops[i] if current_op.type == 'depthwise_conv2d': current_op.desc.set_type("conv2d") i = i + 1 # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_conv_eltwise_mkldnn(self, program): ''' Transpile the program fusing elementwise_add into conv for MKLDNN program. Elementwise add following convolution OP can be fused by adding 'fuse_residual_connection' attribute to convolution OP and replacing its output Tensor with second parameter of elementwise_add. The result of fuse is: - before: - conv->elementwise_add->any_other_op - after: - conv->any_other_op :param program: program to transpile :type program: Program ''' self.block = program.block(0) i = 0 while i < len(self.block.ops): current_op = self.block.ops[i] if current_op.type in ['conv2d']: next_op = self.block.ops[i + 1] if next_op.type == 'elementwise_add': self._fuse_conv_eltwise(i, current_op, next_op) self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add i = i + 1 self._adjust_input() self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_conv_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. Relu activation following convolution OP can be fused by adding 'fuse_relu' attribute to convolution OP. The result of fuse is: - before: - conv->relu->any_other_op - after: - conv->any_other_op :param program: program to transpile :type program: Program ''' self.block = program.block(0) i = 0 while i < len(self.block.ops): current_op = self.block.ops[i] if current_op.type in ['conv2d']: next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu current_op._set_attr("fuse_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_bn_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. Relu activation following batch norm OP can be fused by adding :math:`fuse_with_relu` attribute to batch norm OP. The result of fuse is: - before: - batch_norm->relu->any_other_op - after: - batch_norm->any_other_op :param program: program to transpile :type program: Program ''' self.block = program.block(0) i = 0 while i < len(self.block.ops) - 1: current_op = self.block.ops[i] if current_op.type in ['batch_norm']: next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu current_op._set_attr("fuse_with_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_conv_bias_mkldnn(self, program): ''' Transpile the program by fused convolution and elementwise_add. Replace conv2d and elementwise_add ops with a new conv2d op based on an old conv2d op and the :math:`Bias` taken from elementwise_add. For input :math:`X`: - Conv process: :math:`X = input * W` - Elementwise_add process: :math` X = X + bias` After fuse into one operation: .. math:: X = input * W + bias The operator transformation is: - before: - conv->elementwise_add->any_other_op - after: - conv->any_other_op The transpile stages are: 1. Extract bias and output variables from elementwise_add. 2. Extract Input, Weight and attributes from conv op. 3. Create a new convolution op based on extracted params. 4. Remove old conv op. 5. Remove elementwise_add. 5. Remove unused variables. Args: program (Program): program to transpile ''' self.block = program.block(0) i = 0 while i < len(self.block.ops) - 2: current_op = self.block.ops[i] next_op = self.block.ops[i + 1] # conv2d with bias if current_op.type in ['conv2d'] and \ next_op.type in ['elementwise_add']: self._fuse_conv_bias(i, current_op, next_op) self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add i = i + 1 self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. The batch normalization followed the convolution or fully connected layer can be integrated with them. Doing so will give us a forward acceleration, especially in environments like mobile or embedded. For input :math:`X`: - Conv process: :math:`X = input * W + bias` - Batch norm process: :math:`X' = (X - mean) / std` - Scale Process: :math:`Y = a * X' + b` After fuse into one operation: .. math:: Y &= (input * W + bias - mean) / std * a + b \\\\ &= input * a * W / std + ((bias - mean) / std * a + b) The operator transformation is: - before: - conv->batch_norm->any_other_op (bias == 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0) - after: - conv->elementwise_add->any_other_op The transpile stages are: 1. insert elementwise_add op when bias == 0. 2. fuse the batch_norm's parameters to conv and elementwise_add operators. 3. remove batch_norm ops which are not used in any other ops. 4. adjust the input of any_other_op to be the output of elementwise_add operator. 5. remove unused variables. Args: program (Program): program to transpile place (Place): inference place scope (Scope): inference Scope ''' self.scope = scope self.place = place self.block = program.block(0) self.input_map = {} # store the input names should be adjusted i = 0 while i < len(self.block.ops) - 2: current_op = self.block.ops[i] # TODO(luotao1): consider only conv2d now. fc would be delt later. if current_op.type in ['conv2d']: # TODO(luotao1): consider single chain network now. # For branch network, we counldn't use block.ops[i + 1] as # the judgment condition. next_op = self.block.ops[i + 1] # conv2d without bias if (next_op.type == 'batch_norm'): # insert bias op bias_op = self._insert_bias_op(i + 1, current_op, next_op) # fuse batch_norm self._fuse_param(current_op, next_op, bias_op, 0) # remove batch_norm_op self.block._remove_op(i + 2) i = i + 1 # conv2d with bias, the next_op.type is elementwise_add elif (next_op.type == 'elementwise_add'): next_next_op = self.block.ops[i + 2] if (next_next_op.type == 'batch_norm'): # fuse batch_norm self._fuse_param(current_op, next_next_op, next_op, 1) # remove batch_norm_op self.block._remove_op(i + 2) i = i + 1 i = i + 1 self._adjust_input() self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() def _fuse_mul_add_mkldnn(self, program): ''' Transpile the program by fusing Mul+Add layers to FC layer with the MKL-DNN inner product. The MUL following a Elementwise_add layer can be replaced by the MKL-DNN FC. The Elementwise add's bias input 'Y' has to be added into the MKL-DNN-based FC input 'Bias'. The operator transformation is: - before: - MUL->elementwise_add -> any_other_op - after: - FC -> any_other_op The transpile stages are: 1. insert a new MKL-DNN-based FC operator with `Bias` input taken from the Elementwise add's input 'Y' (bias), 2. fuse the parameters of MUL and Elemenwise add, 3. remove the MUL, elementwise_add operators, 4. make the input of the deleted Elementwise add operator to be the input of the new FC operator, 5. remove unused variables, Args: program (Program): program to transpile ''' self.block = program.block(0) self.input_map = {} # store the input names should be adjusted i = 0 while i < len(self.block.ops): # find a elementwise add op if self.block.ops[i].type == 'elementwise_add': add_op = self.block.ops[i] add_idx = i mul_idx = -1 # find the preceding mul op for j in reversed(range(add_idx)): if self.block.ops[j].type == 'mul': mul_out_name = self.block.ops[j].output_arg_names[0] if self.block.ops[j].output_arg_names[ 0] in add_op.input_arg_names: mul_op = self.block.ops[j] mul_idx = j break if mul_idx < 0: i += 1 continue # create and insert a new fc op fc_op_new = self._insert_fc_op(add_idx + 1, mul_op, add_op) # remove the old operators self.block._remove_op(add_idx) self.block._remove_op(mul_idx) # restart scanning for elementwise add from the deleted mul's index i = mul_idx i += 1 self._adjust_input() self._remove_unused_var() program = program.clone() # ====================== private transpiler functions ===================== def _insert_bias_op(self, index, current_op, bn_op): ''' Construct elementwise_add operator for adding bias and insert it into program. :param index: insert location of bias_op :type index: Int :param current_op: current operator (conv or fc) :type current_op: Operator :param bn_op: batch norm operator :type bn_op: Operator :return: bias_op :rtype: Operator ''' # The input of bias_op is current_op's output and Bias of bn_op # The output of bias_op is bn_op's output x_var = self.block.var(current_op.output("Output")[0]) y_var = self.block.var(bn_op.input("Bias")[0]) out_var = self.block.var(bn_op.output("Y")[0]) bias_op = self.block._insert_op( index, type="elementwise_add", inputs={"X": x_var, "Y": y_var}, outputs={"Out": out_var}, attrs={"axis": 1}) # dim_start=1 return bias_op def _fuse_param(self, current_op, bn_op, bias_op, with_bias): ''' fuse the batch_norm_op' parameters to current_op (conv or fc) :param current_op: current operator (conv or fc) :type current_op: Operator :param bn_op: batch norm operator :type bn_op: Operator :param bias_op: elementwise_add operator for adding bias :type bias_op: Operator :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. :type with_bias: Int ''' def _update_param(op, old_param_name, new_param): # For the sake of remaining the original variables the same as before, # create new variables in scope to store the new parameters. old_param_name = old_param_name[0] old_var = self.block.vars[old_param_name] new_param_name = old_param_name + '_fuse_bn' new_var = self.block.create_parameter( name=new_param_name.encode('ascii'), type=old_var.type, dtype=old_var.dtype, shape=old_var.shape) op._rename_input(old_param_name, new_param_name) self.scope.var(new_param_name) tensor = self.scope.find_var(new_param_name).get_tensor() tensor.set(np.array(new_param), self.place) def _load_param(param_name): return np.array(self.scope.find_var(param_name[0]).get_tensor()) bias_bn = _load_param(bn_op.input("Bias")) #Bias scale_bn = _load_param(bn_op.input("Scale")) #Scale mean_bn = _load_param(bn_op.input("Mean")) #Mean var_bn = _load_param(bn_op.input("Variance")) #Variance # TODO(luotao1): consider only conv2d now. fc would be delt later. current_param = _load_param(current_op.input("Filter")) std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5))) tmp = np.float32(np.divide(scale_bn, std_bn)) # add bias of batch_norm_op to conv2d if with_bias: bias = _load_param(bias_op.input("Y")) else: bias = np.zeros(bias_bn.shape) bias = np.float32( np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn)) # re-compute weight of conv2d tmp = tmp.reshape(tmp.shape[0], -1) dst_param = current_param.reshape((tmp.shape[0], -1)) dst_param = np.float32(np.multiply(dst_param, tmp)) dst_param = dst_param.reshape(current_param.shape) # update parameters _update_param(current_op, current_op.input("Filter"), dst_param) _update_param(bias_op, bias_op.input("Y"), bias) # collect the renamed input self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] def _fuse_conv_bias(self, index, conv_op, elementwise_add_op): ''' fuse the conv op with elementwise_add :param index: index of the conv_op in ops list :type index: Int :param conv_op: convolution operator :type conv_op: Operator :param elementwise_add_op: convolution's bias operator :type elementwise_add_op: Operator ''' bias_var = self.block.var(elementwise_add_op.input("Y")[0]) out_var = self.block.var(elementwise_add_op.output("Out")[0]) filter_var = self.block.var(conv_op.input("Filter")[0]) in_var = self.block.var(conv_op.input("Input")[0]) attrs = {name: conv_op.attr(name) for name in conv_op.attr_names} self.block._insert_op( index, type="conv2d", inputs={"Input": in_var, "Filter": filter_var, "Bias": bias_var}, outputs={"Output": out_var}, attrs=attrs) def _insert_fc_op(self, index, mul_op, add_op): ''' Construct a new FC operator by copying the old Mul and adding the 'Y' input taken from the Elementwise add's input 'Y'. :param index: insert location of FC :type index: Int :param mul_op: MUL operator to be copied :type mul_op: Operator :param add_op: Elementwise add operator taken bias from :type add_op: Operator :return: fc_op_new :type: Operator ''' def get_op_outputs(op, names): result = {} for name in names: result[name] = self.block.var(op.output(name)[0]) return result fc_inputs = {} fc_inputs['Input'] = self.block.var(mul_op.input('X')[0]) fc_inputs['W'] = self.block.var(mul_op.input('Y')[0]) fc_inputs['Bias'] = self.block.var(add_op.input('Y')[0]) fc_outputs = get_op_outputs(add_op, ['Out']) fc_attrs = {} fc_attrs['use_mkldnn'] = True fc_op_new = self.block._insert_op( index, type='fc', inputs=fc_inputs, outputs=fc_outputs, attrs=fc_attrs) return fc_op_new def _fuse_conv_eltwise(self, index, conv_op, eltwise_op): ''' fuse the conv op with elementwise_add :param conv_op: convolution operator :type conv_op: Operator :param eltwise_op: operator adding data from skip connection :type eltwise_op: Operator ''' eltwise_input = "X" if eltwise_op.input("X")[0] == conv_op.output("Output")[0]: eltwise_input = "Y" residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]] out_var = self.block.vars[eltwise_op.output("Out")[0]] filter_var = self.block.vars[conv_op.input("Filter")[0]] in_var = self.block.vars[conv_op.input("Input")[0]] bias_var = self.block.vars[conv_op.input("Bias")[0]] conv_op._set_attr("fuse_residual_connection", True) attrs = {name: conv_op.attr(name) for name in conv_op.attr_names} self.block._insert_op( index, type="conv2d", inputs={ "Input": in_var, "Filter": filter_var, "Bias": bias_var, "ResidualData": residual_var }, outputs={"Output": out_var}, attrs=attrs) def _adjust_input(self): for i in range(len(self.block.ops)): current_op = self.block.ops[i] for input_arg in current_op.input_arg_names: if input_arg in self.input_map: current_op._rename_input(input_arg, self.input_map[input_arg]) def _remove_unused_var(self): ''' remove unused varibles in program ''' args = [] for i in range(len(self.block.ops)): current_op = self.block.ops[i] args += current_op.input_arg_names args += current_op.output_arg_names args = list(set(args)) # unique the input and output arguments for var in list(self.block.vars.keys()): if var not in args: self.block._remove_var(var)