# reader.py

#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle
import numpy as np
import json
import logging
import os
import sys

# Extend the module search path with '../'. The entry is a relative path, so
# it is resolved against the current working directory, not this file.
# NOTE(review): presumably required so the `hapi` package imported below can
# be found — confirm against the project layout.
sys.path.append('../')

from hapi.distributed import DistributedBatchSampler
from paddle.io import Dataset, DataLoader

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): wildcard import; which config_utils names this module relies
# on cannot be determined from this file alone.
from config_utils import *
from bmn_utils import iou_with_anchors, ioa_with_anchors

# dtype that the label arrays are cast to before the dataset returns them.
DATATYPE = "float32"


class BmnDataset(Dataset):
    """Map-style dataset serving per-video features and, outside inference,
    the label arrays built from the video's annotated segments.

    Features are read from ``<feat_path>/<video_name>.npy``. The video set
    comes from a JSON file keyed by video name: ``cfg.INFER.filelist`` in
    'infer' mode, otherwise ``cfg.MODEL.anno_file`` filtered by subset.
    """

    def __init__(self, cfg, mode):
        """Read the settings from *cfg* and index the videos for *mode*.

        Args:
            cfg: configuration object. It must expose ``MODEL.tscale``,
                ``MODEL.dscale``, ``MODEL.anno_file``, ``MODEL.feat_path``
                and ``INFER.filelist`` by attribute, and
                ``cfg[mode.upper()]['subset']`` by key.
            mode: one of 'train', 'valid', 'test' or 'infer'; selects what
                ``__getitem__`` returns.
        """
        self.mode = mode
        self.tscale = cfg.MODEL.tscale  # 100
        self.dscale = cfg.MODEL.dscale  # 100
        self.anno_file = cfg.MODEL.anno_file
        self.feat_path = cfg.MODEL.feat_path
        self.file_list = cfg.INFER.filelist
        self.subset = cfg[mode.upper()]['subset']
        # Width of one temporal bin on the normalised [0, 1] time axis.
        self.tgap = 1. / self.tscale

        self.get_dataset_dict()
        self.get_match_map()

    def __getitem__(self, index):
        """Return the sample stored at position *index*.

        Returns:
            'infer':           (video_feat, video_idx)
            'train' / 'valid': (video_feat, gt_iou_map, gt_start, gt_end)
            'test':            (video_feat, gt_iou_map, gt_start, gt_end,
                                video_idx)
            Any other mode yields ``None``.

        Raises:
            IndexError: if *index* is outside the video list.
        """
        video_name = self.video_list[index]
        # video_list holds unique names (they are dict keys), so the position
        # of video_name is `index` itself; the modulo only maps a negative
        # index to its non-negative equivalent. This replaces an O(n)
        # list.index() scan that ran for every sample.
        video_idx = np.array(index % len(self.video_list)).astype('int64')
        video_feat = self.load_file(video_name)
        if self.mode == 'infer':
            return video_feat, video_idx

        gt_iou_map, gt_start, gt_end = self.get_video_label(video_name)
        if self.mode in ('train', 'valid'):
            return video_feat, gt_iou_map, gt_start, gt_end
        if self.mode == 'test':
            return video_feat, gt_iou_map, gt_start, gt_end, video_idx
        return None

    def __len__(self):
        """Return the number of indexed videos."""
        return len(self.video_list)

    def get_dataset_dict(self):
        """Build ``self.video_dict`` (name -> annotation entry) and the
        sorted ``self.video_list``.

        In 'infer' mode every entry of the file-list JSON is kept. In the
        other modes an entry of the annotation JSON is kept when
        ``self.subset`` is contained in its ``"subset"`` field.

        Raises:
            AssertionError: if the feature directory is missing or empty, or
                if any selected video has no ``<name>.npy`` file in it.
        """
        assert (
            os.path.exists(self.feat_path)), "Input feature path not exists"
        assert (os.listdir(self.feat_path)), "No feature file  in feature path"

        infer_mode = self.mode == "infer"
        json_path = self.file_list if infer_mode else self.anno_file
        # Context manager so the file handle is closed once parsing is done.
        with open(json_path) as json_file:
            annos = json.load(json_file)

        if infer_mode:
            self.video_dict = dict(annos.items())
        else:
            self.video_dict = {
                video_name: video_info
                for video_name, video_info in annos.items()
                if self.subset in video_info["subset"]
            }
        self.video_list = sorted(self.video_dict)
        print("%s subset video numbers: %d" %
              (self.subset, len(self.video_list)))

        video_name_set = {video_name + '.npy' for video_name in self.video_list}
        available_files = set(os.listdir(self.feat_path))
        assert video_name_set <= available_files, \
            "Input feature not exists in feature path"

    def get_match_map(self):
        """Precompute the candidate windows and the per-bin anchors.

        ``self.match_map`` is a ``(tscale * tscale, 2)`` array of
        ``[xmin, xmax]`` rows. Rows are ordered by window length first
        (1..tscale bins) and by start bin second, so row
        ``(length - 1) * tscale + start`` describes the window starting at
        bin ``start``. ``self.anchor_xmin`` / ``self.anchor_xmax`` hold the
        left and right edge of each of the ``tscale`` bins.
        """
        tgap = self.tgap
        # Generated directly in the final row order (length-major).
        windows = [[tgap * start, tgap * start + tgap * length]
                   for length in range(1, self.tscale + 1)
                   for start in range(self.tscale)]
        self.match_map = np.array(windows)
        self.anchor_xmin = [tgap * i for i in range(self.tscale)]
        self.anchor_xmax = [tgap * i for i in range(1, self.tscale + 1)]

    def _boundary_scores(self, boundaries):
        """Score every temporal bin against small regions centred on
        *boundaries*.

        Each boundary is widened to a region ``3 * tgap`` long. For each bin
        the result is the maximum ``ioa_with_anchors`` value over all
        regions.
        """
        gt_len_small = 3 * self.tgap
        region_min = boundaries - gt_len_small / 2
        region_max = boundaries + gt_len_small / 2
        scores = [
            np.max(ioa_with_anchors(bin_min, bin_max, region_min, region_max))
            for bin_min, bin_max in zip(self.anchor_xmin, self.anchor_xmax)
        ]
        return np.array(scores)

    def get_video_label(self, video_name):
        """Build the training targets for one video.

        Segment boundaries are divided by the video duration and clipped to
        [0, 1] before matching.

        Returns:
            Tuple ``(gt_iou_map, gt_start, gt_end)`` cast to ``DATATYPE``:
            ``gt_iou_map`` has shape ``(dscale, tscale)`` and holds, per
            candidate window, the maximum ``iou_with_anchors`` value over all
            annotated segments; ``gt_start`` / ``gt_end`` have one entry per
            temporal bin. A video with no annotations gets all-zero targets.
        """
        video_info = self.video_dict[video_name]
        video_second = video_info['duration_second']
        video_labels = video_info['annotations']

        if not video_labels:
            # The reductions below are undefined on an empty list (np.max
            # raises), so return "no action anywhere" targets instead.
            return (np.zeros([self.dscale, self.tscale], dtype=DATATYPE),
                    np.zeros(self.tscale, dtype=DATATYPE),
                    np.zeros(self.tscale, dtype=DATATYPE))

        gt_bbox = []
        per_segment_iou = []
        for gt in video_labels:
            tmp_start = max(min(1, gt["segment"][0] / video_second), 0)
            tmp_end = max(min(1, gt["segment"][1] / video_second), 0)
            gt_bbox.append([tmp_start, tmp_end])
            segment_iou = iou_with_anchors(
                self.match_map[:, 0], self.match_map[:, 1], tmp_start, tmp_end)
            # match_map has tscale * tscale rows, so this reshape only
            # succeeds when dscale == tscale.
            per_segment_iou.append(
                np.reshape(segment_iou, [self.dscale, self.tscale]))
        gt_iou_map = np.max(np.array(per_segment_iou), axis=0)

        gt_bbox = np.array(gt_bbox)
        gt_start = self._boundary_scores(gt_bbox[:, 0])
        gt_end = self._boundary_scores(gt_bbox[:, 1])
        return gt_iou_map.astype(DATATYPE), gt_start.astype(
            DATATYPE), gt_end.astype(DATATYPE)

    def load_file(self, video_name):
        """Load ``<feat_path>/<video_name>.npy`` and return it transposed,
        cast to ``DATATYPE``."""
        file_path = os.path.join(self.feat_path, video_name + ".npy")
        return np.load(file_path).T.astype(DATATYPE)