diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 1e6555bb02033a28dedd2a1d1962981dfcc97cc2..1a685b9e2ebcd7d4b5a057b506bccf6adcd9952c 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -62,5 +62,21 @@ TEST(inference, image_classification) { LOG(INFO) << output2.dims(); CheckError(output1, output2); + + // float16 inference requires cuda GPUs with >= 5.3 compute capability + if (paddle::platform::GetCUDAComputeCapability(0) >= 53) { + paddle::framework::LoDTensor output3; + std::vector cpu_fetchs3; + cpu_fetchs3.push_back(&output3); + + LOG(INFO) << "--- GPU Runs in float16 mode: ---"; + std::string fp16_dirname = dirname; + fp16_dirname.replace(fp16_dirname.find("book/"), + std::string("book/").size(), "book/float16_"); + TestInference( + fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat); + + CheckError(output2, output3); + } #endif } diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 340882ea9e7b0e2a0c52749c771308c6b860ed07..53486ecffc8dbdcbe93ae12c4f6ebb53c79bce47 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1070,16 +1070,25 @@ class Program(object): for t in targets: if not isinstance(t, Operator): if isinstance(t, Variable): - if t.op is None: - global_block = self.global_block() - for op in global_block.ops: - if t.name in op.output_arg_names: - t.op = op - break + # After transpiler processing, the op that outputs this + # variable may have been changed, so t.op is not reliable + # and we need to find the current op that generates this + # variable here.
+ t.op = None + global_block = self.global_block() + for idx, op in enumerate(global_block.ops): + if t.name in op.output_arg_names: + t.op = op + break + t = t.op + if t is None: + raise ValueError( + "The target variable must have an " + "associated operator that generates it.") else: - raise ValueError(("All targets of prune() can only be " - "Variable or Operator.")) + raise ValueError("All targets of prune() can only be " + "Variable or Operator.") targets_idx.append([t.block.idx, t.idx]) res = Program() diff --git a/python/paddle/fluid/inference_transpiler.py b/python/paddle/fluid/inference_transpiler.py index 39b01610f96018e1775405a30147e77006cecc16..f4ad717b9e72e281940fab0cfd06296306b587fc 100644 --- a/python/paddle/fluid/inference_transpiler.py +++ b/python/paddle/fluid/inference_transpiler.py @@ -121,7 +121,60 @@ class InferenceTranspiler: # And a better solution will be considered later. program = program.clone() + def float16_transpile(self, program, place, scope=None): + ''' + Transpile the program desc and cast the weights to float16 data type to + enable float16 inference. + + Since the operator in a program desc will automatically choose the + right compute kernel to run based on the data type of the input tensor, + we actually don't need to change the program desc to run in float16 mode. + + However, in this way, users who are used to feeding and fetching tensors + of float32 data type when running typical inference may find it confusing + and difficult to run inference in float16 mode as they need to convert + input data to float16 dtype and then convert the results back to float32 + dtype to match the rest of the code. + + So this function appends cast ops to the program desc where necessary so + that users are able to run inference in float16 mode while providing input + tensor (feed_holder) of float data type and obtaining output tensor + (fetch_holder) of float data type.
+ + Moreover, it is desired that when we have the scope and program desc to run + inference in float32 mode, we can use a single API to do the necessary + modification and then the user can run float16 inference on the fly. To make + this happen, this function also creates new parameters in the scope to hold the + converted float16 weights and changes the operators in program desc to use + these new parameters. + + :param program: program to transpile + :type program: Program + :param place: inference place + :type place: Place + :param scope: inference scope + :type scope: Scope + ''' + if scope is None: + scope = global_scope() + + self.scope = scope + self.place = place + self.block = program.block(0) + self.input_map = {} # store the input names that should be adjusted + + self._modify_feed_fetch() + self._convert_param_to_float16() + self._adjust_input(skip=True) + self._remove_unused_var() + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + # ====================== private transpiler functions ===================== + def _insert_bias_op(self, index, current_op, bn_op): ''' Construct elementwise_add operator for adding bias @@ -216,9 +269,27 @@ class InferenceTranspiler: # collect the renamed input self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] - def _adjust_input(self): + def _adjust_input(self, skip=False): + ''' + Change the input variable name in operators. + + When we are in the process of modifying a program desc, we usually + replace some variables with some other variables, where we create + a dictionary input_map to record the one-to-one correspondence + between each old variable and the new one. + + After that, this function will search all the operators that use the + old variables and change the info in op to use the new variables.
There + may be some exceptions to this rule when we are using the float16 transpiler + and insert cast ops to cast a float32 variable to a float16 one. After we + insert the cast op to cast var_1 to var_1_fp16, we don't want to change + the input of cast op to var_1_fp16 after using this function. + ''' + skip_ops = {"cast"} for i in range(len(self.block.ops)): current_op = self.block.ops[i] + if skip and current_op.type in skip_ops: + continue for input_arg in current_op.input_arg_names: if input_arg in self.input_map: current_op.rename_input(input_arg, @@ -238,3 +309,138 @@ class InferenceTranspiler: for var in self.block.vars.keys(): if var not in args: self.block.remove_var(var) + + def _modify_feed_fetch(self): + ''' + Modify feed fetch op/vars for float16 inference. + + For each feed op: + feed_op->feed_target_var + + Change it to: + feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var + + For each fetch op: + fetch_target_var->fetch_op + + Change it to: + tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op + + :return: None + ''' + + def find_op(var): + # It is possible that var.op is not up to date after some + # modifications to program desc. Here we force it to be up to date.
+ var.op = None + for op in self.block.ops: + if var.name in op.output_arg_names: + var.op = op + break + + if var.op is None: + raise ValueError("The target variable must have an " + "associated operator that generates it.") + + i = 0 + while i < len(self.block.ops): + cur_op = self.block.ops[i] + if cur_op.type == "feed": + var_name = cur_op.output("Out")[0] + tmp_var_name = var_name + ".fp16" + var = self.block.vars[var_name] + tmp_var = self.block.create_var( + name=tmp_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape, + persistable=var.persistable) + self.block.insert_op( + i + 1, + type="cast", + inputs={"X": var}, + outputs={"Out": tmp_var}, + attrs={ + 'in_dtype': int(var.dtype), + 'out_dtype': int(tmp_var.dtype) + }) + self.input_map[var_name] = tmp_var_name + i = i + 1 + elif cur_op.type == "fetch": + var_name = cur_op.input("X")[0] + tmp_var_name = var_name + ".fp16" + var = self.block.vars[var_name] + tmp_var = self.block.create_var( + name=tmp_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape, + persistable=var.persistable) + find_op(var) + var.op.rename_output(var_name, tmp_var_name) + self.block.insert_op( + i, + type="cast", + inputs={"X": tmp_var}, + outputs={"Out": var}, + attrs={ + 'in_dtype': int(tmp_var.dtype), + 'out_dtype': int(var.dtype) + }) + i = i + 1 + i = i + 1 + + def _convert_param_to_float16(self): + def _get_no_fp16_conversion_var_names(): + ''' + Get the set of input variable names that shouldn't be converted to float16. + + When we want to run inference in float16 mode, most parameters need to be + converted to float16 first. However, there are some parameters that + shouldn't be converted to float16 because the corresponding operator + requires float32 parameters even in float16 mode (when the input data is + of float16 data type). Currently, the only operator that has this exclusion + is the batch norm op.
+ + :return: set of input variable names + :rtype: set + ''' + op_names = {'batch_norm'} + var_names = [] + for op in self.block.ops: + if op.type in op_names: + var_names += op.input_arg_names + return set(var_names) + + def _should_be_converted(var): + return var.persistable and \ + var.name not in self.no_conversion_vars and \ + var.type != core.VarDesc.VarType.FEED_MINIBATCH and \ + var.type != core.VarDesc.VarType.FETCH_LIST + + self.no_conversion_vars = _get_no_fp16_conversion_var_names() + conversion_var_list = filter(_should_be_converted, + self.block.vars.values()) + for var in conversion_var_list: + fp16_var_name = var.name + ".fp16" + fp16_var = self.block.create_parameter( + name=fp16_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape) + + # cast the data in the tensor of the original var to float16 + # data type and store it in the tensor of the new float16 var + self.scope.var(fp16_var_name) + fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor() + tensor = np.array(self.scope.find_var(var.name).get_tensor()) + # After the old tensor data is converted to np.float16, view(np.uint16) + # is used so that the internal memory of the numpy array will be + # reinterpreted to be of np.uint16 data type, which is bound to the fluid + # float16 data type with the help of pybind in tensor_py.h.
+ fp16_tensor.set( + tensor.astype(np.float16).view(np.uint16), self.place) + + # the old var will be replaced by the fp16 var in the program desc + self.input_map[var.name] = fp16_var_name + self.block.remove_var(var.name) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f7f1ca2598a3e679b24fa8d62c52e4f4de788fe2..08b8a878b6490bc989620085f3f9c06c7032d882 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -336,7 +336,7 @@ def save_inference_model(dirname, if main_program is None: main_program = default_main_program() - copy_program = main_program + copy_program = main_program.clone() if not os.path.isdir(dirname): os.makedirs(dirname) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index db96c82ce2d8376b029e9dcc54ffab669f1def9a..09f994c37020c4612223e4168be4cf535157f60b 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -252,6 +252,26 @@ def infer(use_cuda, save_dirname=None): fetch_targets, exe, inference_transpiler_program) + if use_cuda and fluid.core.is_float16_supported(place): + # Use float16_transpiler to speed up inference + fp16_transpiler_program = inference_transpiler_program.clone() + t.float16_transpile(fp16_transpiler_program, place) + + fp16_results = exe.run(fp16_transpiler_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + assert len(results[0]) == len(fp16_results[0]) + for i in range(len(results[0])): + np.testing.assert_almost_equal( + results[0][i], fp16_results[0][i], decimal=2) + + print("float16 infer results: ", fp16_results[0]) + + fluid.io.save_inference_model("float16_" + save_dirname, + feed_target_names, fetch_targets, exe, + fp16_transpiler_program) + def main(net_type, use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda():