diff --git a/demo/quant/pact_quant_aware/train.py b/demo/quant/pact_quant_aware/train.py
index fb70c0fc22fae462249b54a02145749b2ef40a1b..67945a455d261681046f42caa19e8f1c18a37380 100644
--- a/demo/quant/pact_quant_aware/train.py
+++ b/demo/quant/pact_quant_aware/train.py
@@ -65,6 +65,8 @@
 add_arg('use_pact', bool, True, "Whether to use PACT or not.")
 add_arg('analysis', bool, False, "Whether analysis variables distribution.")
+add_arg('onnx_format', bool, False,
+        "Whether use onnx format or not.")
 add_arg('ce_test', bool, False, "Whether to CE test.")
 # yapf: enable
 
@@ -257,6 +259,8 @@ def compress(args):
         'window_size': 10000,
         # The decay coefficient of moving average, default is 0.9
         'moving_rate': 0.9,
+        # Whether use onnx format or not
+        'onnx_format': args.onnx_format,
     }
 
     # 2. quantization transform programs (training aware)
@@ -298,9 +302,9 @@ def compress(args):
         places,
         quant_config,
         scope=None,
-        act_preprocess_func=act_preprocess_func,
-        optimizer_func=optimizer_func,
-        executor=executor,
+        act_preprocess_func=None,
+        optimizer_func=None,
+        executor=None,
         for_test=True)
     compiled_train_prog = quant_aware(
         train_prog,
@@ -425,29 +429,23 @@ def compress(args):
     # 3. Freeze the graph after training by adjusting the quantize
     # operators' order for the inference.
     # The dtype of float_program's weights is float32, but in int8 range.
-    float_program, int8_program = convert(val_program, places, quant_config, \
-                                          scope=None, \
-                                          save_int8=True)
+    model_path = os.path.join(quantization_model_save_dir, args.model)
+    if not os.path.isdir(model_path):
+        os.makedirs(model_path)
+    float_program = convert(val_program, places, quant_config)
     _logger.info("eval best_model after convert")
     final_acc1 = test(best_epoch, float_program)
     _logger.info("final acc:{}".format(final_acc1))
 
     # 4. Save inference model
-    model_path = os.path.join(quantization_model_save_dir, args.model,
-                              'act_' + quant_config['activation_quantize_type']
-                              + '_w_' + quant_config['weight_quantize_type'])
-    float_path = os.path.join(model_path, 'float')
-    if not os.path.isdir(model_path):
-        os.makedirs(model_path)
-
     paddle.fluid.io.save_inference_model(
-        dirname=float_path,
+        dirname=model_path,
         feeded_var_names=[image.name],
         target_vars=[out],
         executor=exe,
         main_program=float_program,
-        model_filename=float_path + '/model',
-        params_filename=float_path + '/params')
+        model_filename=model_path + '/model.pdmodel',
+        params_filename=model_path + '/model.pdiparams')
 
 
 def main():
diff --git a/demo/quant/quant_aware/train.py b/demo/quant/quant_aware/train.py
index abf6073ec7bce5f870a5b3c1d0ca545351791833..7fc133a465f6e1f90cee34848a024897faeaf85d 100644
--- a/demo/quant/quant_aware/train.py
+++ b/demo/quant/quant_aware/train.py
@@ -126,6 +126,8 @@ def compress(args):
         'window_size': 10000,
         # The decay coefficient of moving average, default is 0.9
         'moving_rate': 0.9,
+        # Whether use onnx format or not
+        'onnx_format': args.onnx_format,
     }
 
     pretrain = True
@@ -294,10 +296,7 @@ def compress(args):
     # operators' order for the inference.
     # The dtype of float_program's weights is float32, but in int8 range.
     ############################################################################################################
-    float_program, int8_program = convert(val_program, places, quant_config, \
-                                          scope=None, \
-                                          save_int8=True,
-                                          onnx_format=args.onnx_format)
+    float_program = convert(val_program, places, quant_config)
     print("eval best_model after convert")
     final_acc1 = test(best_epoch, float_program)
     ############################################################################################################
diff --git a/example/auto_compression/pytorch_yolov5/configs/yolov5s_qat_dis.yaml b/example/auto_compression/pytorch_yolov5/configs/yolov5s_qat_dis.yaml
index ef9bf8b7cbfcfbca983af4f7ecc7a23ce6109af4..0f932a9e61641a9b609c77bd6f435faa31557e79 100644
--- a/example/auto_compression/pytorch_yolov5/configs/yolov5s_qat_dis.yaml
+++ b/example/auto_compression/pytorch_yolov5/configs/yolov5s_qat_dis.yaml
@@ -14,6 +14,7 @@ Distillation:
 
 Quantization:
   use_pact: true
+  onnx_format: False
   activation_quantize_type: 'moving_average_abs_max'
   quantize_op_types:
   - conv2d
diff --git a/paddleslim/auto_compression/compressor.py b/paddleslim/auto_compression/compressor.py
index af16e6e910ff4407596a14c5251d6d32fe2be322..5587e92d4ab6bba057251fdf3389a5ea13025309 100644
--- a/paddleslim/auto_compression/compressor.py
+++ b/paddleslim/auto_compression/compressor.py
@@ -787,15 +787,18 @@ class AutoCompression:
             os.remove(os.path.join(self.tmp_dir, 'best_model.pdopt'))
             os.remove(os.path.join(self.tmp_dir, 'best_model.pdparams'))
 
-        if 'qat' in strategy:
-            test_program, int8_program = convert(test_program, self._places, self._quant_config, \
-                                          scope=paddle.static.global_scope(), \
-                                          save_int8=True)
-
         model_dir = os.path.join(self.tmp_dir,
                                  'strategy_{}'.format(str(strategy_idx + 1)))
         if not os.path.exists(model_dir):
             os.makedirs(model_dir)
+
+        if 'qat' in strategy:
+            test_program = convert(
+                test_program,
+                self._places,
+                self._quant_config,
+                scope=paddle.static.global_scope())
+
         feed_vars = [
             test_program.global_block().var(name)
             for name in test_program_info.feed_target_names
diff --git a/paddleslim/auto_compression/strategy_config.py b/paddleslim/auto_compression/strategy_config.py
index 5226a7c843cc1d81d0bd4b3339b3e140a801b3b2..aad5e23e1a5d8dae0c39d9f392a885db1d64696a 100644
--- a/paddleslim/auto_compression/strategy_config.py
+++ b/paddleslim/auto_compression/strategy_config.py
@@ -65,6 +65,7 @@ class Quantization(BaseStrategy):
                  window_size=10000,
                  moving_rate=0.9,
                  for_tensorrt=False,
+                 onnx_format=False,
                  is_full_quantize=False):
         """
         Quantization Config.
@@ -80,6 +81,7 @@ class Quantization(BaseStrategy):
             window_size(int): Window size for 'range_abs_max' quantization. Default: 10000.
             moving_rate(float): The decay coefficient of moving average. Default: 0.9.
             for_tensorrt(bool): If True, 'quantize_op_types' will be TENSORRT_OP_TYPES. Default: False.
+            onnx_format(bool): Whether to export the quantized model with format of ONNX. Default is False.
             is_full_quantize(bool): If True, 'quantoze_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES. Default: False.
         """
         super(Quantization, self).__init__("Quantization")
@@ -95,6 +97,7 @@ class Quantization(BaseStrategy):
         self.window_size = window_size
         self.moving_rate = moving_rate
         self.for_tensorrt = for_tensorrt
+        self.onnx_format = onnx_format
         self.is_full_quantize = is_full_quantize
diff --git a/paddleslim/quant/quanter.py b/paddleslim/quant/quanter.py
index 9e07c03c6ee5c1bb657393bfbc175d72ebd558fc..3ecff9bf31341d59b8f85ae9b087d0459b2ac8de 100755
--- a/paddleslim/quant/quanter.py
+++ b/paddleslim/quant/quanter.py
@@ -91,7 +91,9 @@ _quant_config_default = {
     # if True, 'quantize_op_types' will be TENSORRT_OP_TYPES
     'for_tensorrt': False,
     # if True, 'quantoze_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES
-    'is_full_quantize': False
+    'is_full_quantize': False,
+    # if True, use onnx format to quant.
+    'onnx_format': False,
 }
 
 
@@ -222,7 +224,6 @@ def quant_aware(program,
                 act_preprocess_func=None,
                 optimizer_func=None,
                 executor=None,
-                onnx_format=False,
                 return_program=False,
                 draw_graph=False):
     """Add quantization and dequantization operators to "program"
@@ -236,7 +237,9 @@ def quant_aware(program,
             Default: None.
         scope(paddle.static.Scope): Scope records the mapping between variable names and variables,
             similar to brackets in programming languages. Usually users can use
-            `paddle.static.global_scope `_. When ``None`` will use `paddle.static.global_scope() `_ . Default: ``None``.
+            `paddle.static.global_scope `_.
+            When ``None`` will use `paddle.static.global_scope() `_ .
+            Default: ``None``.
         for_test(bool): If the 'program' parameter is a test program, this parameter should be
             set to ``True``. Otherwise, set to ``False``.Default: False
         weight_quantize_func(function): Function that defines how to quantize weight. Using this
@@ -291,7 +294,8 @@ def quant_aware(program,
         elif op_type in QUANT_DEQUANT_PASS_OP_TYPES:
             quant_dequant_ops.append(op_type)
     if len(transform_pass_ops) > 0:
-        trannsform_func = 'QuantizationTransformPassV2' if onnx_format else 'QuantizationTransformPass'
+        trannsform_func = 'QuantizationTransformPassV2' if config[
+            'onnx_format'] else 'QuantizationTransformPass'
         transform_pass = eval(trannsform_func)(
             scope=scope,
             place=place,
@@ -313,7 +317,8 @@ def quant_aware(program,
         transform_pass.apply(main_graph)
 
     if len(quant_dequant_ops) > 0:
-        qdq_func = 'AddQuantDequantPassV2' if onnx_format else 'AddQuantDequantPass'
+        qdq_func = 'AddQuantDequantPassV2' if config[
+            'onnx_format'] else 'AddQuantDequantPass'
         quant_dequant_pass = eval(qdq_func)(
             scope=scope,
             place=place,
@@ -516,12 +521,7 @@ def quant_post_static(
 quant_post = quant_post_static
 
 
-def convert(program,
-            place,
-            config=None,
-            scope=None,
-            save_int8=False,
-            onnx_format=False):
+def convert(program, place, config=None, scope=None, save_int8=False):
     """
     convert quantized and well-trained ``program`` to final quantized
     ``program``that can be used to save ``inference model``.
@@ -560,7 +560,7 @@ def convert(program,
     _logger.info("convert config {}".format(config))
     test_graph = IrGraph(core.Graph(program.desc), for_test=True)
 
-    if onnx_format:
+    if config['onnx_format']:
         quant_weight_pass = QuantWeightPass(scope, place)
         quant_weight_pass.apply(test_graph)
     else: