video_action_infer.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import yaml
import glob

import cv2
import numpy as np
import math
import paddle
import sys
from collections import Sequence
import paddle.nn.functional as F

# add deploy path of PadleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
sys.path.insert(0, parent_path)

from paddle.inference import Config, create_predictor
from utils import argsparser, Timer, get_current_memory_mb
from benchmark_utils import PaddleInferBenchmark
from infer import Detector, print_arguments
from video_action_preprocess import VideoDecoder, Sampler, Scale, CenterCrop, Normalization, Image2Array


def softmax(x):
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x


class VideoActionRecognizer(object):
    """
    Args:
        model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
        device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
        run_mode (str): mode of running(paddle/trt_fp32/trt_fp16)
        batch_size (int): size of pre batch in inference
        trt_min_shape (int): min shape for dynamic shape in trt
        trt_max_shape (int): max shape for dynamic shape in trt
        trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): If the model is produced by TRT offline quantitative
            calibration, trt_calib_mode need to set True
        cpu_threads (int): cpu threads
        enable_mkldnn (bool): whether to open MKLDNN
    """

    def __init__(self,
                 model_dir,
                 device='CPU',
                 run_mode='paddle',
                 num_seg=8,
                 seg_len=1,
                 short_size=256,
                 target_size=224,
                 top_k=1,
                 batch_size=1,
                 trt_min_shape=1,
                 trt_max_shape=1280,
                 trt_opt_shape=640,
                 trt_calib_mode=False,
                 cpu_threads=1,
                 enable_mkldnn=False,
                 ir_optim=True):

        self.num_seg = num_seg
        self.seg_len = seg_len
        self.short_size = short_size
        self.target_size = target_size
        self.top_k = top_k

        assert batch_size == 1, "VideoActionRecognizer only support batch_size=1 now."

        self.model_dir = model_dir
        self.device = device
        self.run_mode = run_mode
        self.batch_size = batch_size
        self.trt_min_shape = trt_min_shape
        self.trt_max_shape = trt_max_shape
        self.trt_opt_shape = trt_opt_shape
        self.trt_calib_mode = trt_calib_mode
        self.cpu_threads = cpu_threads
        self.enable_mkldnn = enable_mkldnn
        self.ir_optim = ir_optim

        self.recognize_times = Timer()

        model_file_path = os.path.join(model_dir, "model.pdmodel")
        params_file_path = os.path.join(model_dir, "model.pdiparams")
        self.config = Config(model_file_path, params_file_path)

        if device == "GPU" or device == "gpu":
            self.config.enable_use_gpu(8000, 0)
        else:
            self.config.disable_gpu()
        if self.enable_mkldnn:
            # cache 10 different shapes for mkldnn to avoid memory leak
            self.config.set_mkldnn_cache_capacity(10)
            self.config.enable_mkldnn()

        self.config.switch_ir_optim(self.ir_optim)  # default true

        precision_map = {
            'trt_int8': Config.Precision.Int8,
            'trt_fp32': Config.Precision.Float32,
            'trt_fp16': Config.Precision.Half
        }
        if run_mode in precision_map.keys():
            self.config.enable_tensorrt_engine(
                max_batch_size=self.batch_size,
                precision_mode=precision_map[run_mode])

        self.config.enable_memory_optim()
        # use zero copy
        self.config.switch_use_feed_fetch_ops(False)

        self.predictor = create_predictor(self.config)

    def preprocess_batch(self, file_list):
        batched_inputs = []
        for file in file_list:
            inputs = self.preprocess(file)
            batched_inputs.append(inputs)
        batched_inputs = [
            np.concatenate([item[i] for item in batched_inputs])
            for i in range(len(batched_inputs[0]))
        ]
        self.input_file = file_list
        return batched_inputs

    def get_timer(self):
        return self.recognize_times

    def predict(self, input):
        '''
        Args:
            input (str) or (list): video file path or image data list
        Returns:
            results (dict): 
        '''

        input_names = self.predictor.get_input_names()
        input_tensor = self.predictor.get_input_handle(input_names[0])

        output_names = self.predictor.get_output_names()
        output_tensor = self.predictor.get_output_handle(output_names[0])

        # preprocess
        self.recognize_times.preprocess_time_s.start()
        if type(input) == str:
            inputs = self.preprocess_video(input)
        else:
            inputs = self.preprocess_frames(input)
        self.recognize_times.preprocess_time_s.end()

        inputs = np.expand_dims(
            inputs, axis=0).repeat(
                self.batch_size, axis=0).copy()

        input_tensor.copy_from_cpu(inputs)

        # model prediction
        self.recognize_times.inference_time_s.start()
        self.predictor.run()
        self.recognize_times.inference_time_s.end()

        output = output_tensor.copy_to_cpu()

        # postprocess
        self.recognize_times.postprocess_time_s.start()
        classes, scores = self.postprocess(output)
        self.recognize_times.postprocess_time_s.end()

        return classes, scores

    def preprocess_frames(self, frame_list):
        """
        frame_list: list, frame list
        return: list
        """

        results = {}
        results['frames_len'] = len(frame_list)
        results["imgs"] = frame_list

        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        ops = [
            CenterCrop(self.target_size), Image2Array(),
            Normalization(img_mean, img_std)
        ]
        for op in ops:
            results = op(results)

        res = np.expand_dims(results['imgs'], axis=0).copy()
        return [res]

    def preprocess_video(self, input_file):
        """
        input_file: str, file path
        return: list
        """
        assert os.path.isfile(input_file) is not None, "{0} not exists".format(
            input_file)

        results = {'filename': input_file}
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        ops = [
            VideoDecoder(), Sampler(
                self.num_seg, self.seg_len, valid_mode=True),
            Scale(self.short_size), CenterCrop(self.target_size), Image2Array(),
            Normalization(img_mean, img_std)
        ]
        for op in ops:
            results = op(results)

        res = np.expand_dims(results['imgs'], axis=0).copy()
        return [res]

    def postprocess(self, output):
        output = output.flatten()  # numpy.ndarray
        output = softmax(output)
        classes = np.argpartition(output, -self.top_k)[-self.top_k:]
        classes = classes[np.argsort(-output[classes])]
        scores = output[classes]
        return classes, scores


def main():
    if not FLAGS.run_benchmark:
        assert FLAGS.batch_size == 1
        assert FLAGS.use_fp16 is False
    else:
        assert FLAGS.use_gpu is True

    recognizer = VideoActionRecognizer(
        FLAGS.model_dir,
        short_size=FLAGS.short_size,
        target_size=FLAGS.target_size,
        device=FLAGS.device,
        run_mode=FLAGS.run_mode,
        batch_size=FLAGS.batch_size,
        trt_min_shape=FLAGS.trt_min_shape,
        trt_max_shape=FLAGS.trt_max_shape,
        trt_opt_shape=FLAGS.trt_opt_shape,
        trt_calib_mode=FLAGS.trt_calib_mode,
        cpu_threads=FLAGS.cpu_threads,
        enable_mkldnn=FLAGS.enable_mkldnn, )

    if not FLAGS.run_benchmark:
        classes, scores = recognizer.predict(FLAGS.video_file)
        print("Current video file: {}".format(FLAGS.video_file))
        print("\ttop-1 class: {0}".format(classes[0]))
        print("\ttop-1 score: {0}".format(scores[0]))
    else:
        cm, gm, gu = get_current_memory_mb()
        mems = {'cpu_rss_mb': cm, 'gpu_rss_mb': gm, 'gpu_util': gu * 100}

        perf_info = recognizer.recognize_times.report()
        model_dir = FLAGS.model_dir
        mode = FLAGS.run_mode
        model_info = {
            'model_name': model_dir.strip('/').split('/')[-1],
            'precision': mode.split('_')[-1]
        }
        data_info = {
            'batch_size': FLAGS.batch_size,
            'shape': "dynamic_shape",
            'data_num': perf_info['img_num']
        }
        recognize_log = PaddleInferBenchmark(recognizer.config, model_info,
                                             data_info, perf_info, mems)
        recognize_log('Fight')


if __name__ == '__main__':
    paddle.enable_static()
    parser = argsparser()
    FLAGS = parser.parse_args()
    print_arguments(FLAGS)
    FLAGS.device = FLAGS.device.upper()
    assert FLAGS.device in ['CPU', 'GPU', 'XPU'
                            ], "device should be CPU, GPU or XPU"

    main()