From d0a921ba98b6cc3012e04606f7846db378eb4275 Mon Sep 17 00:00:00 2001
From: Wojciech Uss
Date: Mon, 6 Jul 2020 07:27:57 +0200
Subject: [PATCH] Quant2 updates and fixes (#25313)

---
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  |  16 +-
 .../quantization/quant2_int8_mkldnn_pass.py   |  59 +++++---
 .../fluid/contrib/slim/tests/CMakeLists.txt   |  36 ++---
 ...t2_int8_image_classification_comparison.py | 142 +++++++++++-------
 .../slim/tests/quant2_int8_nlp_comparison.py  |  97 ++++++++----
 .../contrib/slim/tests/save_quant_model.py    |  27 +---
 6 files changed, 228 insertions(+), 149 deletions(-)

diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 246cbf36e09..9881f7f9e56 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -46,10 +46,8 @@ void LogCannotQuantizeOp(Node* op, const char* details = nullptr) {
 }
 
 void LogScaleIsMissingForVar(Node* var) {
-  std::stringstream msg_ss;
-  msg_ss << "Quantization scale for the variable " << var->Name()
-         << " is missing.";
-  PrettyLogDetail(msg_ss.str().c_str());
+  VLOG(4) << "Quantization scale for the variable " << var->Name()
+          << " is missing.";
 }
 
 void LogQuantizationDisabled(Node* op) {
@@ -256,6 +254,14 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
 
+    auto has_output_scale = AreScalesPresentForNodes(conv_op, {conv_output});
+    if (with_residual_data && !has_output_scale) {
+      LogCannotQuantizeOp(conv_op,
+                          "Conv op with ResidualData input cannot be quantized "
+                          "without output scale.");
+      return;
+    }
+
     if (with_residual_data) {
       GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
                                 conv_pattern);
@@ -294,7 +300,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     conv_op->Op()->SetAttr("Scale_weights", filter_scale);
 
     // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes(conv_op, {conv_output})) {
+    if (has_output_scale) {
       bool is_output_unsigned{false};
       auto output_scale = GetScaleValueForNode(conv_output, &is_output_unsigned);
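
The hoisted `has_output_scale` check above matters only for conv ops with a `ResidualData` input: a plain conv whose output scale is missing can still be quantized and return fp32 data, but a conv fused with a residual connection cannot, so it is now skipped early. A minimal Python sketch of the added guard, using hypothetical stand-ins (`op`, `scales`) for the C++ IR node and scale registry:

    # Sketch of the guard added to QuantizeConv() above (hypothetical names).
    def should_quantize_conv(op, scales, with_residual_data):
        has_output_scale = op.output('Output')[0] in scales
        if with_residual_data and not has_output_scale:
            # A conv with a fused residual connection cannot fall back to
            # returning fp32 output, so skip quantizing it entirely.
            return False
        return True
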
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index 0d1e986d4f7..75e1ea43d15 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -55,7 +55,7 @@ class Quant2Int8MkldnnPass(object):
             'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
         self._ops_to_quantize = _ops_to_quantize
-        self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip != None else set(
+        self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set(
             [-1])
         self._scale_immutable_ops = [
             'transpose2', 'reshape2', 'pool2d', 'scale'
         ]
@@ -71,11 +71,14 @@ class Quant2Int8MkldnnPass(object):
         self._var_quant_scales = {}
         self._max_range = {}
         self._s8_max = 127
+        self._pass_idx = 0
+        self._pass_group = 'int8'
 
     def apply(self, graph):
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
+        self._reset_pass_idx_and_group('int8')
         graph = self._gather_weight_scales_from_fake(graph)
         graph = self._gather_output_scales_from_attr(graph)
         graph = self._gather_input_scales_from_fake(graph)
@@ -86,21 +89,24 @@ class Quant2Int8MkldnnPass(object):
         graph = self._update_relu_output_scales(graph)
         graph = self._propagate_scales(graph)
         graph = self._quantize_fp32_graph(graph)
-        graph = self._optimize_int8_graph(graph)
+        graph = self._final_optimizations(graph)
         graph = self._cleanup(graph)
         return graph
 
-    def apply_fp32(self, graph):
+    def prepare_and_optimize_fp32(self, graph):
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
-        graph = self._gather_weight_scales_from_fake(graph)
-        graph = self._remove_fake_ops(graph)
-        graph = self._dequantize_weights(graph)
+        self._reset_pass_idx_and_group('fp32')
         graph = self._optimize_fp32_graph(graph)
+        graph = self._final_optimizations(graph)
         graph = self._cleanup(graph)
         return graph
 
+    def _reset_pass_idx_and_group(self, group):
+        self._pass_idx = 0
+        self._pass_group = group
+
     def _convert_scale2tensor(self, scale):
         tensor = core.LoDTensor()
         tensor.set(scale, core.CPUPlace())
@@ -333,20 +339,38 @@ class Quant2Int8MkldnnPass(object):
     def _optimize_fp32_graph(self, graph):
         graph = self._update_activations(graph)
         graph = self._remove_ctrl_vars(graph)
+        graph = self._apply_pass(graph, 'attention_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'seqconv_eltadd_relu_fuse_pass')
+        # graph = self._apply_pass(graph, 'seqpool_concat_fuse_pass')
+        graph = self._apply_pass(graph, 'seqpool_cvm_concat_fuse_pass')
+        # graph = self._apply_pass(graph, 'embedding_fc_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'fc_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'mul_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'fc_gru_fuse_pass')
+        graph = self._apply_pass(graph, 'mul_gru_fuse_pass')
+        graph = self._apply_pass(graph, 'seq_concat_fc_fuse_pass')
+        graph = self._apply_pass(graph, 'squared_mat_sub_fuse_pass')
+        graph = self._apply_pass(graph, 'is_test_pass')
         graph = self._apply_pass(graph, 'mkldnn_placement_pass',
                                  ['mkldnn_enabled_op_types'], [set()])
         graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass')
         graph = self._apply_pass(graph, 'conv_bn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass')
+        graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass')
+        graph = self._apply_pass(graph,
+                                 'conv_transpose_eltwiseadd_bn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_bias_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_elementwise_add_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_relu_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_relu6_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'fc_fuse_pass',
                                  ['use_gpu', 'use_fc_padding'], [False, False])
+        graph = self._apply_pass(graph, 'repeated_fc_relu_fuse_pass')
         if self._is_fc_quantized(graph):
             graph = self._apply_pass(graph, 'fc_mkldnn_pass')
         graph = self._apply_pass(graph, 'matmul_transpose_reshape_fuse_pass')
+        # the following pass should be the last one since it will work on all fused ops.
+        graph = self._apply_pass(graph, 'runtime_context_cache_pass')
         return graph
 
     def _apply_pass(self, graph, pass_name, attrs=None, attr_values=None):
@@ -362,12 +386,13 @@ class Quant2Int8MkldnnPass(object):
                 ir_pass.set(attr, value)
         ir_pass.apply(cpp_graph)
         if self._debug:
-            graph.draw('.', 'quant_fp32_{}'.format(pass_name),
-                       graph.all_op_nodes())
+            graph.draw('.', '{}_{}_{}'.format(self._pass_group, self._pass_idx,
+                                              pass_name), graph.all_op_nodes())
         self._remove_unused_var_nodes(graph)
+        self._pass_idx += 1
         return graph
 
-    def _optimize_int8_graph(self, graph):
+    def _final_optimizations(self, graph):
         # remove dropout ops
         graph = self._apply_pass(graph, 'simplify_with_basic_ops_pass')
         # make some MKL-DNN ops working inplace
@@ -448,8 +473,7 @@ class Quant2Int8MkldnnPass(object):
                     self._var_quant_scales[out_name] = (True, tensor)
             return graph
 
-        conv_predicate = lambda op: op.attr("fuse_activation") in self._relu_ops and \
-            op.attr("fuse_residual_connection") == False
+        conv_predicate = lambda op: op.attr("fuse_activation") in self._relu_ops
         graph = _set_unsigned_scale(graph, self._conv_ops, "Output",
                                     conv_predicate)
 
@@ -465,15 +489,10 @@ class Quant2Int8MkldnnPass(object):
         return 'NHWC' if self._is_conv_quantized(graph) else 'NCHW'
 
     def _quantize_fp32_graph(self, graph):
-        ir_pass = self._core.get_pass('cpu_quantize_placement_pass')
-        cpp_graph = graph.graph
-        ir_pass.set('quantize_enabled_op_types', self._ops_to_quantize)
-        ir_pass.set('quantize_excluded_op_ids',
-                    self._find_avg_pooling_ids(graph))
-        ir_pass.apply(cpp_graph)
-        if self._debug:
-            graph.draw('.', 'quant_int8_{}'.format(ir_pass.type()),
-                       graph.all_op_nodes())
+        graph = self._apply_pass(
+            graph, 'cpu_quantize_placement_pass',
+            ['quantize_enabled_op_types', 'quantize_excluded_op_ids'],
+            [self._ops_to_quantize, self._find_avg_pooling_ids(graph)])
         graph = self._apply_pass(graph, 'scale_matmul_fuse_pass')
         graph = self._apply_pass(graph,
                                  'reshape_transpose_matmul_mkldnn_fuse_pass')
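
With the renaming above, `apply()` remains the Quant-to-INT8 entry point, while the former `apply_fp32()` becomes `prepare_and_optimize_fp32()` and now only runs the fp32 optimization pipeline (its weight-dequantization steps were dropped). A minimal usage sketch following the pattern of the tests further below; the model path and ops set are illustrative:

    import paddle.fluid as fluid
    from paddle.fluid import core
    from paddle.fluid.framework import IrGraph
    from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.executor.global_scope()
    # 'quant_model' is an illustrative path to a saved Quant model
    [program, feed_names, fetch_targets] = fluid.io.load_inference_model(
        'quant_model', exe)

    graph = IrGraph(core.Graph(program.desc), for_test=True)
    quant_pass = Quant2Int8MkldnnPass(
        {'conv2d', 'pool2d'}, _scope=scope, _place=place, _core=core)
    graph = quant_pass.apply(graph)  # Quant model -> optimized INT8 graph
    # For a plain fp32 model, the fp32 pipeline is used instead:
    # graph = quant_pass.prepare_and_optimize_fp32(graph)
    program = graph.to_program()
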
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index f22ef5b3cd3..e85e8ae15bf 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -57,7 +57,7 @@ endfunction()
 
 # set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25
-function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path ops_to_quantize)
+function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path)
   py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_image_classification_comparison.py"
           ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
@@ -67,12 +67,11 @@ function(inference_quant2_int8_image_classification_test target quant_model_dir
                --infer_data ${dataset_path}
                --batch_size 10
                --batch_num 2
-               --acc_diff_threshold 0.1
-               --ops_to_quantize ${ops_to_quantize})
+               --acc_diff_threshold 0.1)
 endfunction()
 
 # set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20
-function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir dataset_path labels_path)
+function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir dataset_path labels_path ops_to_quantize)
  py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_nlp_comparison.py"
          ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
               OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
@@ -83,7 +82,8 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da
                --labels ${labels_path}
                --batch_size 10
                --batch_num 2
-               --acc_diff_threshold 0.1)
+               --acc_diff_threshold 0.1
+               --ops_to_quantize ${ops_to_quantize})
 endfunction()
 
 function(download_quant_data install_dir data_file)
@@ -98,20 +98,20 @@ function(download_quant_model install_dir data_file)
   endif()
 endfunction()
 
-function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize)
+function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path)
   py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
           ARGS --quant_model_path ${quant_model_dir}
                --fp32_model_save_path ${fp32_model_save_path}
                --int8_model_save_path ${int8_model_save_path}
-               --ops_to_quantize ${ops_to_quantize}
                --debug)
 endfunction()
 
-function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path)
+function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize)
   py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
           ARGS --quant_model_path ${quant_model_dir}
                --fp32_model_save_path ${fp32_model_save_path}
-               --int8_model_save_path ${int8_model_save_path})
+               --int8_model_save_path ${int8_model_save_path}
+               --ops_to_quantize ${ops_to_quantize})
 endfunction()
 
 function(convert_model2dot_test target model_path save_graph_dir save_graph_name)
@@ -224,36 +224,34 @@ if(LINUX AND WITH_MKLDNN)
 
   ### Quant2 for image classification
 
-  set(QUANT2_IC_OPS_TO_QUANTIZE "conv2d,pool2d")
-
   # Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators,
   # with weight scales in `fake_dequantize_max_abs` operators
   set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2")
   set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
   download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE})
   set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50")
-  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QUANT2_IC_OPS_TO_QUANTIZE})
+  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
   # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
   # with weight scales in `fake_dequantize_max_abs` operators
   set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range")
   set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
   download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE})
-  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QUANT2_IC_OPS_TO_QUANTIZE})
+  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
   # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
   # with weight scales in `fake_channel_wise_dequantize_max_abs` operators
   set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise")
   set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
   download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
-  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QUANT2_IC_OPS_TO_QUANTIZE})
+  inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
   # Quant2 MobileNetV1
   set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2")
   set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
   download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE})
   set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1")
-  inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QUANT2_IC_OPS_TO_QUANTIZE})
+  inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
   ### Quant2 for NLP
 
@@ -263,6 +261,8 @@ if(LINUX AND WITH_MKLDNN)
   set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
   download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE})
 
+  set(QUANT2_NLP_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add")
+
   # Quant2 Ernie
   set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
   set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
@@ -270,17 +270,17 @@ if(LINUX AND WITH_MKLDNN)
   set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
   set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
   download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE})
-  inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH})
+  inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
 
   ### Save FP32 model or INT8 model from Quant model
 
   set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8")
   set(QUANT2_FP32_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_fp32")
-  save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH} ${QUANT2_IC_OPS_TO_QUANTIZE})
+  save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH})
 
   set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8")
   set(QUANT2_FP32_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_fp32")
-  save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH})
+  save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
 
   # Convert Quant2 model to dot and pdf files
   set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
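
The `ops_to_quantize` argument moves from the image classification tests to the NLP tests: image classification runs now use the pass defaults (an empty list, leaving operator selection to `cpu_quantize_placement_pass`), while the Ernie tests quantize `fc,reshape2,transpose2,matmul,elementwise_add` explicitly. The resulting NLP test run is roughly equivalent to the following command (paths are placeholders):

    python quant2_int8_nlp_comparison.py \
        --quant_model=<quant_model_dir> \
        --fp32_model=<fp32_model_dir> \
        --infer_data=<data_file> --labels=<labels_file> \
        --batch_size=10 --batch_num=2 --acc_diff_threshold=0.1 \
        --ops_to_quantize=fc,reshape2,transpose2,matmul,elementwise_add
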
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
index 2d92e3a02f9..77c925a1b11 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
@@ -167,7 +167,8 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
                  batch_size=1,
                  batch_num=1,
                  skip_batch_num=0,
-                 transform_to_int8=False):
+                 target='quant'):
+        assert target in ['quant', 'int8', 'fp32']
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         inference_scope = fluid.executor.global_scope()
@@ -183,17 +184,19 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
                 graph.draw('.', 'quant_orig', graph.all_op_nodes())
-            if (transform_to_int8):
-                transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
-                    self._quantized_ops,
-                    _op_ids_to_skip=self._op_ids_to_skip,
-                    _scope=inference_scope,
-                    _place=place,
-                    _core=core,
-                    _debug=self._debug)
-                graph = transform_to_mkldnn_int8_pass.apply(graph)
-            else:
+            quant_transform_pass = Quant2Int8MkldnnPass(
+                self._quantized_ops,
+                _op_ids_to_skip=self._op_ids_to_skip,
+                _scope=inference_scope,
+                _place=place,
+                _core=core,
+                _debug=self._debug)
+            if (target == 'quant'):
                 graph = self._prepare_for_fp32_mkldnn(graph)
+            elif (target == 'int8'):
+                graph = quant_transform_pass.apply(graph)
+            else:  # target == fp32
+                graph = quant_transform_pass.prepare_and_optimize_fp32(graph)
 
             inference_program = graph.to_program()
 
@@ -222,18 +225,7 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
                 images = np.array(images).astype('float32')
                 labels = np.array([x[1] for x in data]).astype('int64')
 
-                if (transform_to_int8 == True):
-                    # INT8 models obtained from Quant models do not have accuracy measuring layers
-                    start = time.time()
-                    out = exe.run(inference_program,
-                                  feed={feed_target_names[0]: images},
-                                  fetch_list=fetch_targets)
-                    batch_time = (time.time() - start) * 1000  # in miliseconds
-                    outputs.append(out[0])
-                    # Calculate accuracy result
-                    batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
-                                                                      labels)
-                else:
+                if (target == 'fp32'):
                     # FP32 models have accuracy measuring layers
                     labels = labels.reshape([-1, 1])
                     start = time.time()
@@ -246,6 +238,18 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
                     batch_time = (time.time() - start) * 1000  # in miliseconds
                     batch_acc1, batch_acc5 = out[1][0], out[2][0]
                     outputs.append(batch_acc1)
+                else:
+                    # Quant INT8 models do not have accuracy measuring layers
+                    start = time.time()
+                    out = exe.run(inference_program,
+                                  feed={feed_target_names[0]: images},
+                                  fetch_list=fetch_targets)
+                    batch_time = (time.time() - start) * 1000  # in miliseconds
+                    outputs.append(out[0])
+                    # Calculate accuracy result
+                    batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
+                                                                      labels)
+
                 infer_accs1.append(batch_acc1)
                 infer_accs5.append(batch_acc5)
                 samples = len(data)
@@ -274,28 +278,37 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
 
         return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
-    def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
+    def _print_performance(self, title, fps, lat):
+        _logger.info('{0}: avg fps: {1:.2f}, avg latency: {2:.4f} ms'.format(
+            title, fps, lat))
+
+    def _print_accuracy(self, title, acc1, acc5):
+        _logger.info(
+            '{0}: avg top1 accuracy: {1:.4f}, avg top5 accuracy: {2:.4f}'.
+            format(title, acc1, acc5))
+
+    def _summarize_performance(self, int8_fps, int8_lat, fp32_fps, fp32_lat):
         _logger.info('--- Performance summary ---')
-        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            fp32_fps, fp32_lat))
-        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            int8_fps, int8_lat))
+        self._print_performance('INT8', int8_fps, int8_lat)
+        if fp32_lat >= 0:
+            self._print_performance('FP32', fp32_fps, fp32_lat)
 
-    def _compare_accuracy(self, fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                          threshold):
+    def _summarize_accuracy(self, quant_acc1, quant_acc5, int8_acc1, int8_acc5,
+                            fp32_acc1, fp32_acc5):
         _logger.info('--- Accuracy summary ---')
+        self._print_accuracy('Quant', quant_acc1, quant_acc5)
+        self._print_accuracy('INT8', int8_acc1, int8_acc5)
+        if fp32_acc1 >= 0:
+            self._print_accuracy('FP32', fp32_acc1, fp32_acc5)
+
+    def _compare_accuracy(self, threshold, quant_acc1, int8_acc1):
         _logger.info(
-            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - IN8_top1_acc) <= threshold)'
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (Quant_top1_acc - IN8_top1_acc) <= threshold && Quant_top1_acc > 0.5 && INT8_top1_acc > 0.5)'
             .format(threshold))
-        _logger.info(
-            'FP32: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(fp32_acc1, fp32_acc5))
-        _logger.info(
-            'INT8: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(int8_acc1, int8_acc5))
-        assert fp32_acc1 > 0.0
-        assert int8_acc1 > 0.0
-        assert fp32_acc1 - int8_acc1 <= threshold
+        # We assume valid accuracy to be at least 0.5
+        assert quant_acc1 > 0.5
+        assert int8_acc1 > 0.5
+        assert quant_acc1 - int8_acc1 <= threshold
 
     def test_graph_transformation(self):
         if not fluid.core.is_compiled_with_mkldnn():
@@ -303,10 +316,9 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
 
         quant_model_path = test_case_args.quant_model
         assert quant_model_path, 'The Quant model path cannot be empty. Please, use the --quant_model option.'
-        fp32_model_path = test_case_args.fp32_model
-        assert fp32_model_path, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.'
         data_path = test_case_args.infer_data
         assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
+        fp32_model_path = test_case_args.fp32_model
         batch_size = test_case_args.batch_size
         batch_num = test_case_args.batch_num
         skip_batch_num = test_case_args.skip_batch_num
@@ -323,9 +335,10 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
             self._op_ids_to_skip = set(
                 map(int, test_case_args.op_ids_to_skip.split(',')))
 
-        _logger.info('FP32 & Quant INT8 prediction run.')
+        _logger.info('Quant & INT8 prediction run.')
         _logger.info('Quant model: {}'.format(quant_model_path))
-        _logger.info('FP32 model: {}'.format(fp32_model_path))
+        if fp32_model_path:
+            _logger.info('FP32 model: {}'.format(fp32_model_path))
         _logger.info('Dataset: {}'.format(data_path))
         _logger.info('Batch size: {}'.format(batch_size))
         _logger.info('Batch number: {}'.format(batch_num))
@@ -336,17 +349,20 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
                 map(str, self._op_ids_to_skip))
             if test_case_args.op_ids_to_skip else 'none'))
 
-        _logger.info('--- FP32 prediction start ---')
+        _logger.info('--- Quant prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
-        fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
+        quant_output, quant_acc1, quant_acc5, quant_fps, quant_lat = self._predict(
             val_reader,
-            fp32_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=False)
-        _logger.info('--- Quant INT8 prediction start ---')
+            target='quant')
+        self._print_performance('Quant', quant_fps, quant_lat)
+        self._print_accuracy('Quant', quant_acc1, quant_acc5)
+
+        _logger.info('--- INT8 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
         int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
@@ -355,11 +371,29 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=True)
-
-        self._summarize_performance(fp32_fps, fp32_lat, int8_fps, int8_lat)
-        self._compare_accuracy(fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                               acc_diff_threshold)
+            target='int8')
+        self._print_performance('INT8', int8_fps, int8_lat)
+        self._print_accuracy('INT8', int8_acc1, int8_acc5)
+
+        fp32_acc1 = fp32_acc5 = fp32_fps = fp32_lat = -1
+        if fp32_model_path:
+            _logger.info('--- FP32 prediction start ---')
+            val_reader = paddle.batch(
+                self._reader_creator(data_path), batch_size=batch_size)
+            fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
+                val_reader,
+                fp32_model_path,
+                batch_size,
+                batch_num,
+                skip_batch_num,
+                target='fp32')
+            self._print_performance('FP32', fp32_fps, fp32_lat)
+            self._print_accuracy('FP32', fp32_acc1, fp32_acc5)
+
+        self._summarize_performance(int8_fps, int8_lat, fp32_fps, fp32_lat)
+        self._summarize_accuracy(quant_acc1, quant_acc5, int8_acc1, int8_acc5,
+                                 fp32_acc1, fp32_acc5)
+        self._compare_accuracy(acc_diff_threshold, quant_acc1, int8_acc1)
 
 
 if __name__ == '__main__':
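
The comparison test now drives three runs from one script: the Quant model as-is, its INT8 transformation, and an optional fp32 baseline, selected by the new `target` argument of `_predict()`. A simplified sketch of the graph dispatch; the helper name `_transform_graph` is hypothetical (the logic is inlined in `_predict()`), and `_prepare_for_fp32_mkldnn` is the test's existing helper for running the raw Quant graph:

    def _transform_graph(self, graph, quant_transform_pass, target):
        # quant_transform_pass is the Quant2Int8MkldnnPass built in _predict()
        assert target in ['quant', 'int8', 'fp32']
        if target == 'quant':
            return self._prepare_for_fp32_mkldnn(graph)  # Quant model as-is
        if target == 'int8':
            return quant_transform_pass.apply(graph)  # Quant -> INT8
        return quant_transform_pass.prepare_and_optimize_fp32(graph)
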
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
index ff50cd683be..640d500152d 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
@@ -17,8 +17,6 @@ import os
 import sys
 import argparse
 import logging
-import struct
-import six
 import numpy as np
 import time
 import paddle
@@ -143,7 +141,8 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
                  batch_size=1,
                  batch_num=1,
                  skip_batch_num=0,
-                 transform_to_int8=False):
+                 target='quant'):
+        assert target in ['quant', 'int8', 'fp32']
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         inference_scope = fluid.executor.global_scope()
@@ -159,15 +158,19 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
                 graph.draw('.', 'quant_orig', graph.all_op_nodes())
-            if (transform_to_int8):
-                transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
+            if (target != 'quant'):
+                quant_transform_pass = Quant2Int8MkldnnPass(
                     self._quantized_ops,
                     _op_ids_to_skip=self._op_ids_to_skip,
                     _scope=inference_scope,
                     _place=place,
                     _core=core,
                     _debug=self._debug)
-                graph = transform_to_mkldnn_int8_pass.apply(graph)
+                if (target == 'int8'):
+                    graph = quant_transform_pass.apply(graph)
+                else:  # target == fp32
+                    graph = quant_transform_pass.prepare_and_optimize_fp32(
+                        graph)
 
             inference_program = graph.to_program()
 
@@ -223,26 +226,35 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
 
         return acc_avg, pps_avg, latency_avg
 
-    def _summarize_performance(self, fp32_pps, fp32_lat, int8_pps, int8_lat):
-        _logger.info('--- Performance summary ---')
-        _logger.info(
-            'FP32: avg predictions per sec: {0:.2f}, avg latency: {1:.4f} ms'.
-            format(fp32_pps, fp32_lat))
+    def _print_performance(self, title, pps, lat):
         _logger.info(
-            'INT8: avg predictions per sec: {0:.2f}, avg latency: {1:.4f} ms'.
-            format(int8_pps, int8_lat))
+            '{0}: avg predictions per sec: {1:.2f}, avg latency: {2:.4f} ms'.
+            format(title, pps, lat))
+
+    def _print_accuracy(self, title, acc):
+        _logger.info('{0}: avg accuracy: {1:.6f}'.format(title, acc))
+
+    def _summarize_performance(self, int8_pps, int8_lat, fp32_pps, fp32_lat):
+        _logger.info('--- Performance summary ---')
+        self._print_performance('INT8', int8_pps, int8_lat)
+        if fp32_lat >= 0:
+            self._print_performance('FP32', fp32_pps, fp32_lat)
 
-    def _compare_accuracy(self, fp32_acc, int8_acc, threshold):
+    def _summarize_accuracy(self, quant_acc, int8_acc, fp32_acc):
         _logger.info('--- Accuracy summary ---')
+        self._print_accuracy('Quant', quant_acc)
+        self._print_accuracy('INT8', int8_acc)
+        if fp32_acc >= 0:
+            self._print_accuracy('FP32', fp32_acc)
+
+    def _compare_accuracy(self, threshold, quant_acc, int8_acc):
         _logger.info(
-            'Accepted accuracy drop threshold: {0}. (condition: (FP32_acc - INT8_acc) <= threshold)'
+            'Accepted accuracy drop threshold: {0}. (condition: (Quant_acc - INT8_acc) <= threshold)'
            .format(threshold))
-        _logger.info('FP32: avg accuracy: {0:.6f}'.format(fp32_acc))
-        _logger.info('INT8: avg accuracy: {0:.6f}'.format(int8_acc))
         # Random outputs give accuracy about 0.33, we assume valid accuracy to be at least 0.5
-        assert fp32_acc > 0.5
+        assert quant_acc > 0.5
         assert int8_acc > 0.5
-        assert fp32_acc - int8_acc <= threshold
+        assert quant_acc - int8_acc <= threshold
 
     def test_graph_transformation(self):
         if not fluid.core.is_compiled_with_mkldnn():
@@ -250,9 +262,9 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
 
         quant_model_path = test_case_args.quant_model
         assert quant_model_path, 'The Quant model path cannot be empty. Please, use the --quant_model option.'
-        fp32_model_path = test_case_args.fp32_model if test_case_args.fp32_model else quant_model_path
         data_path = test_case_args.infer_data
         assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
+        fp32_model_path = test_case_args.fp32_model
         labels_path = test_case_args.labels
         batch_size = test_case_args.batch_size
         batch_num = test_case_args.batch_num
@@ -270,9 +282,10 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
             self._op_ids_to_skip = set(
                 map(int, test_case_args.op_ids_to_skip.split(',')))
 
-        _logger.info('FP32 & Quant INT8 prediction run.')
+        _logger.info('Quant & INT8 prediction run.')
         _logger.info('Quant model: {}'.format(quant_model_path))
-        _logger.info('FP32 model: {}'.format(fp32_model_path))
+        if fp32_model_path:
+            _logger.info('FP32 model: {}'.format(fp32_model_path))
         _logger.info('Dataset: {}'.format(data_path))
         _logger.info('Labels: {}'.format(labels_path))
         _logger.info('Batch size: {}'.format(batch_size))
@@ -284,18 +297,20 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
                 map(str, self._op_ids_to_skip))
             if test_case_args.op_ids_to_skip else 'none'))
 
-        _logger.info('--- FP32 prediction start ---')
+        _logger.info('--- Quant prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path, labels_path), batch_size=batch_size)
-        fp32_acc, fp32_pps, fp32_lat = self._predict(
+        quant_acc, quant_pps, quant_lat = self._predict(
             val_reader,
-            fp32_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=False)
-        _logger.info('FP32: avg accuracy: {0:.6f}'.format(fp32_acc))
-        _logger.info('--- Quant INT8 prediction start ---')
+            target='quant')
+        self._print_performance('Quant', quant_pps, quant_lat)
+        self._print_accuracy('Quant', quant_acc)
+
+        _logger.info('--- INT8 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path, labels_path), batch_size=batch_size)
         int8_acc, int8_pps, int8_lat = self._predict(
@@ -304,11 +319,29 @@ class QuantInt8NLPComparisonTest(unittest.TestCase):
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=True)
-        _logger.info('INT8: avg accuracy: {0:.6f}'.format(int8_acc))
+            target='int8')
+        self._print_performance('INT8', int8_pps, int8_lat)
+        self._print_accuracy('INT8', int8_acc)
+
+        fp32_acc = fp32_pps = fp32_lat = -1
+        if fp32_model_path:
+            _logger.info('--- FP32 prediction start ---')
+            val_reader = paddle.batch(
+                self._reader_creator(data_path, labels_path),
+                batch_size=batch_size)
+            fp32_acc, fp32_pps, fp32_lat = self._predict(
+                val_reader,
+                fp32_model_path,
+                batch_size,
+                batch_num,
+                skip_batch_num,
+                target='fp32')
+            self._print_performance('FP32', fp32_pps, fp32_lat)
+            self._print_accuracy('FP32', fp32_acc)
 
-        self._summarize_performance(fp32_pps, fp32_lat, int8_pps, int8_lat)
-        self._compare_accuracy(fp32_acc, int8_acc, acc_diff_threshold)
+        self._summarize_performance(int8_pps, int8_lat, fp32_pps, fp32_lat)
+        self._summarize_accuracy(quant_acc, int8_acc, fp32_acc)
+        self._compare_accuracy(acc_diff_threshold, quant_acc, int8_acc)
 
 
 if __name__ == '__main__':
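
In both comparison scripts the fp32 model is now optional: its metrics default to -1 as a "not measured" sentinel, the summaries print FP32 rows only when a model was supplied, and the accuracy gate compares INT8 against the Quant model instead of against fp32. The acceptance condition reduces to the following (a simplified restatement of `_compare_accuracy()`):

    def accuracy_ok(quant_acc, int8_acc, threshold):
        # Random outputs score about 0.33 on this dataset, so anything below
        # 0.5 is treated as an invalid run rather than a small regression.
        return (quant_acc > 0.5 and int8_acc > 0.5 and
                quant_acc - int8_acc <= threshold)
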
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index ae880ef5452..dab4b63cda4 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -35,11 +35,6 @@ def parse_args():
         type=str,
         default='',
         help='A path to a Quant model.')
-    parser.add_argument(
-        '--fp32_model_save_path',
-        type=str,
-        default='',
-        help='Saved optimized fp32 model')
     parser.add_argument(
         '--int8_model_save_path',
         type=str,
@@ -65,7 +60,7 @@ def parse_args():
     return test_args, sys.argv[:1] + args
 
 
-def transform_and_save_model(original_path, save_path, save_type):
+def transform_and_save_int8_model(original_path, save_path):
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     inference_scope = fluid.executor.global_scope()
@@ -96,26 +91,18 @@ def transform_and_save_model(original_path, save_path, save_type):
         _place=place,
         _core=core,
         _debug=test_args.debug)
-
-    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
-    if save_type == 'FP32':
-        graph = transform_to_mkldnn_int8_pass.apply_fp32(graph)
-    elif save_type == 'INT8':
-        graph = transform_to_mkldnn_int8_pass.apply(graph)
+    graph = transform_to_mkldnn_int8_pass.apply(graph)
     inference_program = graph.to_program()
     with fluid.scope_guard(inference_scope):
         fluid.io.save_inference_model(save_path, feed_target_names,
                                       fetch_targets, exe, inference_program)
-    print("Success! Transformed Quant_{0} model can be found at {1}\n".
-          format(save_type, save_path))
+    print(
+        "Success! INT8 model obtained from the Quant model can be found at {}\n"
        .format(save_path))
 
 
 if __name__ == '__main__':
     global test_args
     test_args, remaining_args = parse_args()
-    if test_args.fp32_model_save_path:
-        transform_and_save_model(test_args.quant_model_path,
-                                 test_args.fp32_model_save_path, 'FP32')
-    if test_args.int8_model_save_path:
-        transform_and_save_model(test_args.quant_model_path,
-                                 test_args.int8_model_save_path, 'INT8')
+    transform_and_save_int8_model(test_args.quant_model_path,
+                                  test_args.int8_model_save_path)
-- 
GitLab