Unverified commit 1a9376a8, authored by zhouzj, committed by GitHub

[LatencyPredictor] Add new op and speed up prediction (#1014)

* add new op type and support fp16 model

* preload predictors' model and speed up prediction

* preload predictors' model

* preload predictors' model

* Modified the save path of TMP files
Co-authored-by: minghaoBD <79566150+minghaoBD@users.noreply.github.com>
Parent: 23cc74de
@@ -18,7 +18,7 @@ import pickle
 import paddle
 import paddleslim
 import subprocess
-import sklearn
+import time
 __all__ = [
     "save_cls_model", "save_det_model", "save_seg_model", "nearest_interpolate",
     "opt_model", "load_predictor"
@@ -29,10 +29,11 @@ def opt_model(opt="paddle_lite_opt",
               model_file='',
               param_file='',
               optimize_out_type='protobuf',
-              valid_targets='arm'):
+              valid_targets='arm',
+              enable_fp16=False):
     assert os.path.exists(model_file) and os.path.exists(
         param_file), f'{model_file} or {param_file} does not exist.'
-    save_dir = f'./opt_models_tmp/{os.getpid()}'
+    save_dir = f'./opt_models_tmp/{os.getpid()}_{time.time()}'
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
@@ -41,8 +42,8 @@
         model_out = os.path.join(save_dir, 'pbmodel')
     else:
         model_out = os.path.join(save_dir, 'model')
-    cmd = f'{opt} --model_file={model_file} --param_file={param_file} --optimize_out_type={optimize_out_type} --optimize_out={model_out} --valid_targets={valid_targets}'
+    enable_fp16 = str(enable_fp16).lower()
+    cmd = f'{opt} --model_file={model_file} --param_file={param_file} --optimize_out_type={optimize_out_type} --optimize_out={model_out} --valid_targets={valid_targets} --enable_fp16={enable_fp16}'
     print(f'commands:{cmd}')
     m = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
......
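For context, a minimal sketch of how the updated helper might be invoked; the import path and model file names below are assumptions, and `paddle_lite_opt` must be available on `PATH`:

```python
# Hypothetical call to opt_model after this change (paths illustrative;
# the import location is an assumption, not confirmed by this diff).
from paddleslim.analysis import opt_model

pbmodel_file = opt_model(
    model_file='mobilenet_v1.pdmodel',
    param_file='mobilenet_v1.pdiparams',
    optimize_out_type='protobuf',
    valid_targets='arm',
    enable_fp16=True)  # forwarded to paddle_lite_opt as --enable_fp16=true
```

Because the TMP directory is now `./opt_models_tmp/{pid}_{timestamp}`, two predictions running at the same time no longer overwrite each other's converted models.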
@@ -52,8 +52,13 @@ def get_features_from_paramkey(param_key, op_type, data_type):
     features = None
     if 'conv2d' in op_type:
-        flag_quant = 'quant=None' if data_type == 'fp32' else 'quant=True'
-        if flag_quant not in param_key:
+        if data_type == 'fp16':
+            quant_bits = 'bit_length=16'
+        elif data_type == 'int8':
+            quant_bits = 'bit_length=8'
+        else:
+            quant_bits = 'bit_length=None'
+        if quant_bits not in param_key:
             return None
         weight = re.search(r'weight=(\(\d*, \d*, \d*, \d*\))',
@@ -178,7 +183,7 @@ def get_features_from_paramkey(param_key, op_type, data_type):
             'leaky_relu' in op_type or 'tanh' in op_type or 'swish' in op_type or
             'softmax' in op_type or 'hard_sigmoid' in op_type or
             'sigmoid' in op_type or 'gelu' in op_type or 'clip' in op_type or
-            'shape' in op_type or 'interp_v2' in op_type):
+            'shape' in op_type or 'interp_v2' in op_type or 'sqrt' in op_type):
         inputs = re.search(r'in=(\((-?\d+,* *)+\))',
                            param_key).group().split('=')[-1].strip(
......
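As a standalone illustration of the new precision filter (this is not the library API; the sample key is invented):

```python
# Sketch of the quant_bits check above: a conv2d table entry is only
# used when its bit_length tag matches the requested data_type.
def matches_precision(param_key, data_type):
    if data_type == 'fp16':
        quant_bits = 'bit_length=16'
    elif data_type == 'int8':
        quant_bits = 'bit_length=8'
    else:
        quant_bits = 'bit_length=None'
    return quant_bits in param_key

sample_key = 'conv2d in=[1, 3, 224, 224] quant=True bit_length=16'
assert matches_precision(sample_key, 'fp16')
assert not matches_precision(sample_key, 'fp32')
```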
@@ -16,7 +16,7 @@
 import os
 import pickle
-import time
+import shutil
 import subprocess
 from .parse_ops import get_key_from_op
 from .extract_features import get_data_from_tables, get_features_from_paramkey
@@ -71,15 +71,16 @@ class TableLatencyPredictor(LatencyPredictor):
         self.hardware = None
         self.threads = None
         self.predictor_state = False
+        self.predictor = {}
         self._initial_table()

     def _initial_table(self):
         if self.table_file in ['SD625', 'SD710', 'SD845', 'SD865']:
             self.hardware = self.table_file
-            if self.hardware in ['SD625', 'SD710']:
-                self.predictor_state = True
             self.threads = 4
             self.table_file = f'{self.hardware}_threads_4_power_mode_0.pkl'
+            if self.hardware in ['SD625', 'SD710']:
+                self.predictor_state = True
         if not os.path.exists(self.table_file):
             subprocess.call(
                 f'wget https://paddlemodels.bj.bcebos.com/PaddleSlim/analysis/{self.table_file}',
@@ -115,6 +116,19 @@ class TableLatencyPredictor(LatencyPredictor):
                 break
         return in_shape

+    def _preload_predictor(self, data_type='fp32'):
+        op_types = [
+            'depthwise_conv2d', 'conv2d', 'pool2d', 'matmul', 'elementwise_add',
+            'elementwise_mul', 'concat', 'calib', 'swish'
+        ]
+        op_dir = self.table_file.split('.')[0] + '_batchsize_1'
+        for op_type in op_types:
+            model = load_predictor(op_type, op_dir, data_type)
+            key = op_type
+            if 'conv2d' in op_type:
+                key = f'{op_type}_{data_type}'
+            self.predictor[key] = model
+
     def predict(self,
                 model_file,
                 param_file,
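The cache keying is the core of the speed-up: each pickled predictor is deserialized once here instead of on every operator lookup, and only conv2d-like predictors are precision-specific. A minimal sketch of the scheme:

```python
# Illustrative restatement of the key rule shared by _preload_predictor
# and op_predictor; not library code.
def cache_key(op_type, data_type):
    return f'{op_type}_{data_type}' if 'conv2d' in op_type else op_type

assert cache_key('depthwise_conv2d', 'fp16') == 'depthwise_conv2d_fp16'
assert cache_key('pool2d', 'fp16') == 'pool2d'  # shared across precisions
```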
@@ -125,22 +139,27 @@
         Args:
             model_file(str), param_file(str): The inference model(*.pdmodel, *.pdiparams).
-            data_type(str): Data type, fp32 or int8. Default : fp32
+            data_type(str): Data type, fp32, fp16 or int8.
             threads(int): threads num
             input_shape(list): Generally, the input shape is confirmed when saving the inference model and the parameter is only effective for input shape that has variable length.
         Returns:
             latency(float): The latency of the model.
         """
-        assert data_type in ['fp32', 'int8'
-                             ], f'data_type must be one of [fp32, int8]'
+        assert data_type in ['fp32', 'int8', 'fp16'
+                             ], f'data_type must be one of [fp32, int8, fp16]'
         if self.hardware and self.threads != threads:
             self._change_table(threads)

+        if self.predictor_state and f'conv2d_{data_type}' not in self.predictor:
+            self._preload_predictor(data_type)
+
+        enable_fp16 = True if data_type == 'fp16' else False
         pbmodel_file = opt_model(
             model_file=model_file,
             param_file=param_file,
-            optimize_out_type='protobuf', )
+            optimize_out_type='protobuf',
+            enable_fp16=enable_fp16)

         paddle.enable_static()
         with open(pbmodel_file, "rb") as f:
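End to end, the new fp16 path might be exercised like this; the import path and model files are assumptions, and the latency unit is whatever the lookup table was built with:

```python
# Hypothetical usage of the extended predict() (a sketch, not a test).
from paddleslim.analysis import TableLatencyPredictor

predictor = TableLatencyPredictor(table_file='SD710')
latency = predictor.predict(
    model_file='mobilenet_v1.pdmodel',
    param_file='mobilenet_v1.pdiparams',
    data_type='fp16')  # now accepted alongside 'fp32' and 'int8'
print(f'estimated latency: {latency}')
```

The first call pays the one-off `_preload_predictor` cost; later calls with the same data_type reuse the cached op-level predictors.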
@@ -176,7 +195,7 @@ class TableLatencyPredictor(LatencyPredictor):
             warnings.warn("OperatorType\tCalledTimes")
             for key in new_op:
                 warnings.warn(f"{key.ljust(15)}\t{new_op[key]}")
+        shutil.rmtree(os.path.dirname(pbmodel_file))
         return latency

     def op_predictor(self, op_type, param_key, data_type):
@@ -185,18 +204,20 @@ class TableLatencyPredictor(LatencyPredictor):
         Args:
             op_type: The operator's type
             param_key: The operator's parameter information.
-            data_type: Data type, fp32 or int8. Default : int8
+            data_type: Data type, fp32 or int8.
         Returns:
             latency(float): The latency of the operator.
         """
         latency = 0.0
-        op_dir = self.table_file.split('.')[0] + '_batchsize_1'
         if op_type in [
                 'depthwise_conv2d', 'conv2d', 'pool2d', 'matmul',
                 'elementwise_add', 'elementwise_mul', 'concat', 'calib', 'swish'
         ]:
-            predictor = load_predictor(op_type, op_dir, data_type)
+            key = op_type
+            if 'conv2d' in op_type:
+                key = f'{op_type}_{data_type}'
+            predictor = self.predictor[key]
             features = get_features_from_paramkey(param_key, op_type, data_type)
             latency = predictor.predict([features])
         else:
......
@@ -24,25 +24,30 @@ def get_key_from_op(op):
     if 'conv2d' in op_type:
         out_shape = op.all_outputs()[0].shape()
         in_shape = op.all_inputs()[-1].shape()
+        in_name = op.all_inputs()[1].name()
         weight_shape = op.all_inputs()[-2].shape()
         kernel = weight_shape[2]
+        weight_shape = (out_shape[1], weight_shape[1], weight_shape[2], weight_shape[3])
         stride = op.attr('strides')[1]
         padding = op.attr('paddings')[1]
         groups = op.attr('groups')
         dilation = op.attr('dilations')[1]
-        int8 = op.attr('enable_int8')
+        quant = op.attr('enable_int8')
         bit_length = op.attr('bit_length')

+        if op.attr(in_name+'_fp16') == 'fp16':
+            quant = True
+            bit_length = 16
+
-        param_key = f'{op_type} in={in_shape} weight={weight_shape} out={out_shape} pad={padding} stride={stride} group={groups} dilation={dilation} quant={int8} bit_length={bit_length}'
+        param_key = f'{op_type} in={in_shape} weight={weight_shape} out={out_shape} pad={padding} stride={stride} group={groups} dilation={dilation} quant={quant} bit_length={bit_length}'

     elif op_type == 'matmul' or op_type == 'matmul_v2':
         X = op.all_inputs()[0].shape()
         Y = op.all_inputs()[1].shape()
         out_shape = op.all_outputs()[0].shape()
-        int8 = op.attr('enable_int8')
+        quant = op.attr('enable_int8')
         bit_length = op.attr('bit_length')
-        param_key = f'{op_type} X={X} Y={Y} out={out_shape} quant={int8} bit_length={bit_length}'
+        param_key = f'{op_type} X={X} Y={Y} out={out_shape} quant={quant} bit_length={bit_length}'

     elif 'batch_norm' in op_type or 'layer_norm' in op_type:
         out_shape = op.all_outputs()[-1].shape()
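To make the fp16 tagging concrete, here is roughly the param_key such a conv2d would emit (every value below is invented):

```python
# Illustrative conv2d key when the input tensor carries an fp16 tag,
# i.e. op.attr(in_name+'_fp16') == 'fp16'; values are made up.
op_type, in_shape, out_shape = 'conv2d', [1, 32, 112, 112], [1, 64, 112, 112]
weight_shape, padding, stride, groups, dilation = (64, 32, 3, 3), 1, 1, 1, 1
quant, bit_length = True, 16
param_key = (f'{op_type} in={in_shape} weight={weight_shape} out={out_shape} '
             f'pad={padding} stride={stride} group={groups} dilation={dilation} '
             f'quant={quant} bit_length={bit_length}')
# Note the tuple-formatted weight and the bit_length=16 suffix, which is
# exactly what the new quant_bits filter in extract_features matches.
```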
@@ -67,14 +72,12 @@
     elif op_type in [
             'hard_swish', 'relu', 'leaky_relu', 'tanh', 'swish', 'softmax',
-            'hard_sigmoid', 'sigmoid', 'gelu', 'clip', 'shape'
+            'hard_sigmoid', 'sigmoid', 'gelu', 'clip', 'shape', 'sqrt'
     ] or 'transpose' in op_type or 'interp_v2' in op_type:
         in_shape = op.all_inputs()[-1].shape()
         out_shape = op.all_outputs()[0].shape()
-        param_key = f'{op_type} in={in_shape}'
-        in_shape = op.all_inputs()[-1].shape()
-        param_key = f'{op_type} in={in_shape}'
+        param_key = f'{op_type} in={in_shape} out={out_shape}'

     elif op_type in ['fill_constant', 'range', 'cast'] or 'expand' in op_type:
......
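Activation keys now record the output shape too; for instance (shapes invented, `sqrt` being one of the newly covered ops):

```python
op_type, in_shape, out_shape = 'sqrt', [1, 256, 14, 14], [1, 256, 14, 14]
param_key = f'{op_type} in={in_shape} out={out_shape}'
# -> 'sqrt in=[1, 256, 14, 14] out=[1, 256, 14, 14]'
```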