# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from pathlib import Path
from abc import ABCMeta, abstractmethod

from paddle.io import Dataset

from .preprocess import build_preprocess

# File suffixes treated as images; passed as a tuple to ``str.endswith``.
IMG_EXTENSIONS = ('.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm',
                  '.PPM', '.bmp', '.BMP')


def scandir(dir_path, suffix=None, recursive=False):
    """Scan a directory to find the files of interest.

    Files whose name starts with ``'.'`` are never yielded. Sub-directories
    (including ones whose name starts with ``'.'``) are descended into only
    when ``recursive`` is True.

    Args:
        dir_path (str | obj:`Path`): Path of the directory.
        suffix (str | tuple(str), optional): File suffix that we are
            interested in. Default: None, meaning every file is yielded.
        recursive (bool, optional): If set to True, recursively scan the
            directory. Default: False.

    Returns:
        A generator over the matching files, as paths relative to
        ``dir_path``.

    Raises:
        TypeError: If ``dir_path`` is not a str/Path, or ``suffix`` is
            neither None, a str, nor a tuple.
    """
    # Argument validation runs eagerly (this outer function is not itself a
    # generator), so bad arguments fail at call time, not on first iteration.
    if isinstance(dir_path, (str, Path)):
        dir_path = str(dir_path)
    else:
        raise TypeError('"dir_path" must be a string or Path object')

    if (suffix is not None) and not isinstance(suffix, (str, tuple)):
        raise TypeError('"suffix" must be a string or tuple of strings')

    root = dir_path

    def _scandir(dir_path, suffix, recursive):
        # The context manager closes the OS directory handle even when the
        # consumer stops iterating early.
        with os.scandir(dir_path) as entries:
            for entry in entries:
                if not entry.name.startswith('.') and entry.is_file():
                    rel_path = os.path.relpath(entry.path, root)
                    if suffix is None or rel_path.endswith(suffix):
                        yield rel_path
                elif recursive and entry.is_dir():
                    # Only descend into real directories. Hidden files,
                    # broken symlinks and other non-directory entries also
                    # reach this branch and must not be handed to
                    # os.scandir, which would raise on them.
                    yield from _scandir(entry.path,
                                        suffix=suffix,
                                        recursive=recursive)

    return _scandir(dir_path, suffix=suffix, recursive=recursive)


class BaseDataset(Dataset, metaclass=ABCMeta):
    """Base class for datasets.

    All datasets should subclass it.
    All subclasses should overwrite:

        ``prepare_data_infos``, supporting to load information and generate
        image lists.

    Args:
        preprocess (list[dict]): A sequence of data preprocess config.
    """

    def __init__(self, preprocess=None):
        super().__init__()

        # ``self.preprocess`` is only created when a non-empty config is
        # given; ``__getitem__`` guards for its absence with ``hasattr``.
        if preprocess:
            self.preprocess = build_preprocess(preprocess)

    @abstractmethod
    def prepare_data_infos(self):
        """Abstract function for loading annotation.

        All subclasses should overwrite this function and set
        ``self.data_infos`` in it.

        data_infos should be a list of dict:
        [{key_path: file_path}, {key_path: file_path}, {key_path: file_path}]
        """
        self.data_infos = None

    @staticmethod
    def scan_folder(path):
        """Obtain sample path list (including sub-folders) from a given folder.

        Args:
            path (str|pathlib.Path): Folder path.

        Returns:
            list[str]: sample list obtained from given folder; each entry is
                ``path`` joined with the file's path relative to ``path``.

        Raises:
            TypeError: If ``path`` is neither a str nor a Path object.
            AssertionError: If no file with an ``IMG_EXTENSIONS`` suffix is
                found under ``path``.
        """
        if isinstance(path, (str, Path)):
            path = str(path)
        else:
            raise TypeError("'path' must be a str or a Path object, "
                            f'but received {type(path)}.')

        samples = [
            os.path.join(path, rel_path)
            for rel_path in scandir(path, suffix=IMG_EXTENSIONS, recursive=True)
        ]
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept as
        # AssertionError for backward compatibility.
        if not samples:
            raise AssertionError('{} has no valid image file.'.format(path))
        return samples

    def __getitem__(self, idx):
        """Return the sample at ``idx``, preprocessed when configured.

        Reads ``self.data_infos[idx]`` and, if a truthy ``self.preprocess``
        attribute exists, returns the result of calling it on that entry.
        """
        datas = self.data_infos[idx]

        if hasattr(self, 'preprocess') and self.preprocess:
            datas = self.preprocess(datas)

        return datas

    def __len__(self):
        """Length of the dataset.

        Returns:
            int: Length of the dataset.
        """
        return len(self.data_infos)