# mtmct.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pptracking.python.mot.visualize import plot_tracking
import os
import re
import cv2
import gc
import numpy as np
try:
    from sklearn import preprocessing
    from sklearn.cluster import AgglomerativeClustering
except:
    print(
        'Warning: Unable to use MTMCT in PP-Human, please install sklearn, for example: `pip install sklearn`'
    )
    pass
import pandas as pd
from tqdm import tqdm
from functools import reduce
import warnings
warnings.filterwarnings("ignore")


def gen_restxt(output_dir_filename, map_tid, cid_tid_dict):
    """Write the cross-camera tracking result to a text file.

    Each written line is ``cid tid fid x1 y1 w h`` separated by spaces, where
    ``cid`` and ``fid`` are shifted to be 1-based and ``tid`` is the identity
    taken from ``map_tid``.

    Args:
        output_dir_filename (str): path of the text file to (over)write.
        map_tid (dict): maps a ``"c<cid>_t<tid>"`` key to its new track id.
            Tracks whose key is missing from this mapping are not written.
        cid_tid_dict (dict): maps a ``"c<cid>_t<tid>"`` key to a dict holding
            ``"rects"`` and ``"frames"``. Every rect is read through
            ``rect[0]`` as ``[conf, x1, y1, x2, y2]``.

    The boxes in ``cid_tid_dict`` are left unmodified; the x2/y2 -> w/h
    conversion is done on a copy so the function can be called repeatedly.
    """
    # `\d+` (not `\d`) so camera / track indices >= 10 are parsed correctly.
    pattern = re.compile(r'c(\d+)_t(\d+)')
    with open(output_dir_filename, 'w') as f_w:
        for key, res in cid_tid_dict.items():
            if key not in map_tid:
                continue
            new_tid = map_tid[key]
            cid = int(pattern.search(key).group(1)) + 1
            frames = res["frames"]
            for idx, bbox in enumerate(res["rects"]):
                # np.array() copies, keeping the caller's box intact.
                box = np.array(bbox[0])
                box[3:] -= box[1:3]  # (x2, y2) -> (w, h)
                fid = frames[idx] + 1
                rect = [max(int(x), 0) for x in box[1:]]
                f_w.write(
                    str(cid) + ' ' + str(new_tid) + ' ' + str(fid) + ' ' +
                    ' '.join(map(str, rect)) + '\n')
    print('gen_res: write file in {}'.format(output_dir_filename))


def get_mtmct_matching_results(pred_mtmct_file, secs_interval=0.5,
                               video_fps=20):
    """Load an MTMCT result file and group it per camera / track / window.

    Args:
        pred_mtmct_file (str): text file whose rows start with
            ``cid, tid, fid, x1, y1, w, h``; extra columns are dropped.
        secs_interval (float): window length in seconds.
        video_fps (int): frame rate used to turn ``secs_interval`` into a
            number of frames.

    Returns:
        tuple: ``(camera_results, cid_tid_fid_results)`` where

        * ``camera_results[c_id]`` is the array of rows of camera ``c_id``;
        * ``cid_tid_fid_results[c_id][t_id][f_id]`` holds the rows of track
          ``t_id`` whose frame id lies in ``[f_id, f_id + interval)``. Only
          track ids present in every camera are kept, and only frame ids
          that are multiples of ``interval`` start a window.

        Both dicts are empty when the file contains no rows.
    """
    # ndmin=2 keeps a one-line file two-dimensional so column slicing works.
    res = np.loadtxt(pred_mtmct_file, ndmin=2)
    if res.size == 0:
        return dict(), dict()

    camera_ids = list(map(int, np.unique(res[:, 0])))

    # each line in res: 'cid, tid, fid, x1, y1, w, h'
    res = res[:, :7]

    camera_tids = []
    camera_results = dict()
    for c_id in camera_ids:
        camera_results[c_id] = res[res[:, 0] == c_id]
        tids = np.unique(camera_results[c_id][:, 1])
        camera_tids.append(list(map(int, tids)))

    # select common tids throughout each video
    common_tids = reduce(np.intersect1d, camera_tids)

    # get mtmct matching results by cid_tid_fid_results[c_id][t_id][f_id]
    cid_tid_fid_results = dict()
    interval = int(secs_interval * video_fps)  # preferably less than 10
    for c_id in camera_ids:
        cid_tid_fid_results[c_id] = dict()
        for t_id in common_tids:
            tid_mask = camera_results[c_id][:, 1] == t_id
            camera_trackid_results = camera_results[c_id][tid_mask]
            track_fids = camera_trackid_results[:, 2]

            # windows start only on frame ids that are multiples of interval
            fids = np.unique(track_fids)
            fids = list(map(int, fids[fids % interval == 0]))

            windows = dict()
            for f_id in fids:
                frame_mask = np.logical_and(track_fids >= f_id,
                                            track_fids < f_id + interval)
                windows[f_id] = camera_trackid_results[frame_mask]
            cid_tid_fid_results[c_id][t_id] = windows

    return camera_results, cid_tid_fid_results


def save_mtmct_vis_results(camera_results, captures, output_dir):
    """Render one annotated video per input video into ``<output_dir>/mtmct_vis``.

    Args:
        camera_results (dict): maps a camera id to an array whose rows are
            ``cid, tid, fid, x1, y1, w, h``.
        captures (list): video file paths. The i-th path is paired with the
            i-th key of ``camera_results``.
        output_dir (str): parent directory of the ``mtmct_vis`` folder. An
            existing ``mtmct_vis`` folder is deleted and recreated.

    For every frame, the rows whose frame id equals the 1-based frame counter
    are handed to ``plot_tracking`` and the returned image is written to
    ``vis_<basename>``.
    """
    # camera_results: 'cid, tid, fid, x1, y1, w, h'
    camera_ids = list(camera_results.keys())

    import shutil
    save_dir = os.path.join(output_dir, 'mtmct_vis')
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    for idx, video_file in enumerate(captures):
        capture = cv2.VideoCapture(video_file)
        writer = None
        try:
            cid = camera_ids[idx]
            basename = os.path.basename(video_file)
            video_out_name = "vis_" + basename
            out_path = os.path.join(save_dir, video_out_name)
            print("Start visualizing output video: {}".format(out_path))

            # Get Video info : resolution, fps
            width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = int(capture.get(cv2.CAP_PROP_FPS))
            fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
            writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))

            cam_res = camera_results[cid]
            frame_id = 0
            while True:
                if frame_id % 50 == 0:
                    print('frame id: ', frame_id)
                ret, frame = capture.read()
                # incremented before the lookup: frame ids in the results
                # are compared against a 1-based counter
                frame_id += 1
                if not ret:
                    if frame_id == 1:
                        print("video read failed!")
                    break
                frame_results = cam_res[cam_res[:, 2] == frame_id]
                boxes = frame_results[:, -4:]
                ids = frame_results[:, 1]
                image = plot_tracking(
                    frame, boxes, ids, frame_id=frame_id, fps=fps)
                writer.write(image)
        finally:
            # always give the decoder / encoder handles back, even on error
            if writer is not None:
                writer.release()
            capture.release()


def get_euclidean(x, y, **kwargs):
    """Return the matrix of squared Euclidean distances between rows.

    Uses the expansion ``|a|^2 + |b|^2 - 2 * a.b``; no square root is taken.

    Args:
        x (np.ndarray): array of shape (m, d).
        y (np.ndarray): array of shape (n, d).
        **kwargs: accepted and ignored.

    Returns:
        np.ndarray: array of shape (m, n).
    """
    x_sq = np.power(x, 2).sum(axis=1, keepdims=True)  # (m, 1)
    y_sq = np.power(y, 2).sum(axis=1, keepdims=True).T  # (1, n)
    # broadcasting expands both squared-norm terms to (m, n)
    pairwise = x_sq + y_sq
    pairwise -= np.dot(2 * x, y.T)
    return pairwise


def cosine_similarity(x, y, eps=1e-12):
    """
    Pairwise cosine similarity between the rows of x and the rows of y.

    Every row is divided by its L2 norm (clamped from below by ``eps`` so
    all-zero rows do not divide by zero) before taking the dot products.
    Value == 1 means the same direction
    Value == 0 means perpendicular vectors

    Args:
        x (np.ndarray): array of shape (m, d).
        y (np.ndarray): array of shape (n, d).
        eps (float): lower bound applied to the row norms.

    Returns:
        np.ndarray: array of shape (m, n).
    """

    def _unit_rows(mat):
        # scale each row to unit length, guarding tiny norms with eps
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        floor = eps * np.ones_like(norms)
        return mat / np.maximum(norms, floor)

    return np.dot(_unit_rows(x), _unit_rows(y).T)


def get_cosine(x, y, eps=1e-12):
    """
    Thin wrapper that forwards to ``cosine_similarity``.

    NOTE: despite being selected as a "distance" by ``get_dist_mat``, the
    value returned here is the cosine *similarity* (larger means closer),
    not ``1 - similarity``. ``get_sim_matrix_new`` performs the
    ``1 - similarity`` conversion itself.

    Args:
        x (np.ndarray): array of shape (m, d).
        y (np.ndarray): array of shape (n, d).
        eps (float): forwarded to ``cosine_similarity``.

    Returns:
        np.ndarray: similarity matrix of shape (m, n).
    """
    return cosine_similarity(x, y, eps)


def get_dist_mat(x, y, func_name="euclidean"):
    """Compute the pairwise matrix between rows of x and y.

    Args:
        x (np.ndarray): array of shape (m, d).
        y (np.ndarray): array of shape (n, d).
        func_name (str): ``"euclidean"`` dispatches to ``get_euclidean``,
            ``"cosine"`` dispatches to ``get_cosine``.

    Returns:
        np.ndarray: matrix of shape (m, n) produced by the selected function.

    Raises:
        ValueError: if ``func_name`` is not one of the supported names.
    """
    if func_name == "cosine":
        dist_mat = get_cosine(x, y)
    elif func_name == "euclidean":
        dist_mat = get_euclidean(x, y)
    else:
        # fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "Unsupported distance function '{}', expected 'cosine' or "
            "'euclidean'".format(func_name))
    print("Using {} as distance function during evaluation".format(func_name))
    return dist_mat


def intracam_ignore(st_mask, cid_tids):
    """Zero the mask entries that pair two tracks of the same camera.

    Args:
        st_mask (np.ndarray): 2-D mask with at least ``len(cid_tids)`` rows
            and columns; it is modified in place.
        cid_tids (list): track keys. For string keys of the form
            ``"c<cid>_t<tid>"`` the full camera number is compared, so
            camera 1 and camera 10 are distinct. Any other key falls back to
            comparing its element at index 1.

    Returns:
        np.ndarray: the same ``st_mask`` object, with ``st_mask[i, j] == 0``
        wherever keys ``i`` and ``j`` belong to the same camera (this
        includes the diagonal).
    """
    cam_pattern = re.compile(r'c(\d+)_t')

    def _camera_of(cid_tid):
        # camera identifier used for the same-camera comparison
        if isinstance(cid_tid, str):
            matched = cam_pattern.match(cid_tid)
            if matched is not None:
                return matched.group(1)
        return cid_tid[1]

    count = len(cid_tids)
    # map every camera identifier to a small integer so the pairwise
    # comparison can be done by numpy instead of a Python double loop
    codes = dict()
    cam_codes = np.array(
        [codes.setdefault(_camera_of(key), len(codes)) for key in cid_tids],
        dtype=np.int64)
    same_camera = cam_codes[:, None] == cam_codes[None, :]

    # basic slicing yields a view, so this writes into st_mask itself
    block = st_mask[:count, :count]
    block[same_camera] = 0.
    return st_mask


def get_sim_matrix_new(cid_tid_dict, cid_tids):
    """Build the pairwise cost matrix between all tracks.

    Note: camera independent get_sim_matrix function,
    which is different from the one in camera_utils.py.

    Args:
        cid_tid_dict (dict): maps a track key to a dict containing
            ``'mean_feat'``.
        cid_tids (list): track keys; fixes the row / column order.

    Returns:
        np.ndarray: ``1 - s`` where ``s`` is the cosine similarity matrix
        with same-camera pairs and the diagonal forced to 0 (so those
        entries come out as 1).
    """
    n_tracks = len(cid_tids)
    feats = [cid_tid_dict[key]['mean_feat'] for key in cid_tids]

    # query and gallery are the same set of track features
    query_feats = np.array(feats)
    gallery_feats = np.array(feats)
    similarity = get_dist_mat(query_feats, gallery_feats, func_name="cosine")

    # mask the elements which belong to the same video
    cross_cam_mask = intracam_ignore(
        np.ones((n_tracks, n_tracks), dtype=np.float32), cid_tids)

    masked = similarity * cross_cam_mask
    np.fill_diagonal(masked, 0.)
    return 1. - masked


def get_match(cluster_labels):
    """Group sample indices by their cluster label.

    Args:
        cluster_labels (iterable): one hashable label per sample.

    Returns:
        list: one list of sample indices per distinct label, ordered by the
        first appearance of each label; indices inside a group are
        ascending.
    """
    members_by_label = dict()
    for index, label in enumerate(cluster_labels):
        # direct dict lookup instead of scanning a list of keys per sample
        members_by_label.setdefault(label, []).append(index)
    return list(members_by_label.values())


def get_cid_tid(cluster_labels, cid_tids):
    """Translate groups of indices into groups of track keys.

    Args:
        cluster_labels (list): list of groups, each a list of indices into
            ``cid_tids``.
        cid_tids (list): track keys.

    Returns:
        list: same nesting as ``cluster_labels`` with every index replaced
        by ``cid_tids[index]``.
    """
    return [[cid_tids[member] for member in group]
            for group in cluster_labels]


def get_labels(cid_tid_dict, cid_tids):
    """Cluster the tracks and return the clusters as lists of indices.

    Args:
        cid_tid_dict (dict): maps a track key to a dict containing
            ``'mean_feat'``.
        cid_tids (list): track keys; indices in the result refer to it.

    Returns:
        list: one list of indices into ``cid_tids`` per cluster.
    """
    # AgglomerativeClustering refuses inputs with fewer than two samples;
    # zero or one track trivially forms zero or one cluster.
    if len(cid_tids) < 2:
        return [[idx] for idx in range(len(cid_tids))]

    # compute cost matrix between features
    cost_matrix = get_sim_matrix_new(cid_tid_dict, cid_tids)

    # cluster all the features
    cluster_kwargs = dict(
        n_clusters=None, distance_threshold=0.5, linkage='complete')
    try:
        # newer scikit-learn names this parameter `metric`
        clusterer = AgglomerativeClustering(
            metric='precomputed', **cluster_kwargs)
    except TypeError:
        # older scikit-learn only knows `affinity`
        clusterer = AgglomerativeClustering(
            affinity='precomputed', **cluster_kwargs)
    cluster_labels = clusterer.fit_predict(cost_matrix)
    return get_match(cluster_labels)


def sub_cluster(cid_tid_dict):
    '''
    Assign a cluster id to every track key.

    cid_tid_dict: all camera_id and track_id, keyed by track key.
    Returns a dict mapping each track key to a cluster id; ids start at 1
    and follow the order of the groups returned by get_labels.
    '''
    # keys in sorted order define the index space used by the clustering
    ordered_keys = sorted(cid_tid_dict)

    # cluster all track ids; every group is a list of indices
    groups = get_labels(cid_tid_dict, ordered_keys)

    # relabel: every key of the i-th group gets id i + 1
    cid_tid_label = dict()
    for group_id, members in enumerate(groups, start=1):
        for member in members:
            cid_tid_label[ordered_keys[member]] = group_id
    return cid_tid_label


def distill_idfeat(mot_res):
    """Condense the per-frame features of one track into a single vector.

    Args:
        mot_res (dict): track record with parallel sequences
            ``"qualities"``, ``"features"`` and ``"rects"``. Every rect is
            read through ``rect[0]`` as ``[conf, xmin, ymin, xmax, ymax]``.

    Returns:
        np.ndarray: mean of at most five features, preferring frames with a
        large enough box and the highest quality scores.
    """
    qualities_list = mot_res["qualities"]
    feature_list = mot_res["features"]
    rects = mot_res["rects"]

    qualities_new = []
    feature_new = []
    # keep only frames whose box is valid and larger than 2000 px^2
    for idx, rect in enumerate(rects):
        conf, xmin, ymin, xmax, ymax = rect[0]
        if xmax > xmin and (xmax - xmin) * (ymax - ymin) > 2000:
            qualities_new.append(qualities_list[idx])
            feature_new.append(feature_list[idx])
    # take all features if fewer than 2 boxes pass the size filter
    if len(qualities_new) < 2:
        qualities_new = qualities_list
        feature_new = feature_list

    # if more than 20 frames are available, keep every second one
    skipf = 1
    if len(qualities_new) > 20:
        skipf = 2
    quality_skip = np.array(qualities_new[::skipf])
    feature_skip = np.array(feature_new[::skipf])

    # sort by image quality (best first) and keep the most trustworthy ones
    order = np.argsort(quality_skip)[::-1]
    # the threshold mask must be built in the sorted order to line up with
    # `order`
    good = quality_skip[order] > 0.6
    if good.sum() > 1:
        topk_feat = feature_skip[order[good]]
    else:
        topk_feat = feature_skip[order]

    # final feature: mean of at most the five best ones
    mean_feat = np.mean(topk_feat[:5], axis=0)
    return mean_feat


def res2dict(multi_res):
    """Flatten per-camera tracking results into one dict of tracks.

    Args:
        multi_res (list): one dict per camera (its position in the list is
            the camera index) mapping a track id to its record; a record
            holds at least ``"rects"``.

    Returns:
        dict: maps ``"c<cid>_t<tid>"`` to the track record. Tracks with
        fewer than 10 rects are skipped. Each kept record gains a
        ``'mean_feat'`` entry computed by ``distill_idfeat``.
    """
    cid_tid_dict = {}
    for cid, c_res in enumerate(multi_res):
        for tid, res in c_res.items():
            key = "c{!s}_t{!s}".format(cid, tid)
            # ignore keys already registered and tracks that are too short
            if key in cid_tid_dict or len(res["rects"]) < 10:
                continue
            cid_tid_dict[key] = res
            res['mean_feat'] = distill_idfeat(res)
    return cid_tid_dict


def mtmct_process(multi_res, captures, mtmct_vis=True, output_dir="output"):
    """Run multi-camera track association and write the results.

    Args:
        multi_res (list): per-camera tracking results, as accepted by
            ``res2dict``.
        captures (list): video file paths, used only for visualization.
        mtmct_vis (bool): when True, also render annotated videos through
            ``save_mtmct_vis_results``.
        output_dir (str): directory receiving ``mtmct_result.txt`` (and the
            visualization folder); created, including parents, if missing.

    Returns:
        None. Nothing is written when no usable track is found.
    """
    cid_tid_dict = res2dict(multi_res)
    if len(cid_tid_dict) == 0:
        print("no tracking result found, mtmct will be skiped.")
        return
    map_tid = sub_cluster(cid_tid_dict)

    # makedirs handles nested paths and an already existing directory
    os.makedirs(output_dir, exist_ok=True)
    pred_mtmct_file = os.path.join(output_dir, 'mtmct_result.txt')
    gen_restxt(pred_mtmct_file, map_tid, cid_tid_dict)

    if mtmct_vis:
        # only the per-camera rows are needed for drawing
        camera_results, _ = get_mtmct_matching_results(pred_mtmct_file)

        save_mtmct_vis_results(camera_results, captures, output_dir=output_dir)