Support any name for a manifest (#4041)

* Initial version
* Fix support 2 versions && fix case for cloud storages
* Fix eslint errors
* tmp
* Fix manifest validation when data hasn't been copied yet
* fix
* Update changelog

Support any name for a manifest (#4041)
* Initial version
* Fix support 2 versions && fix case for cloud storages
* Fix eslint errors
* tmp
* Fix manifest validation when data hasn't been copied yet
* fix
* Update changelog
5281e793 · Maria Khrustaleva · GitHub · f74b6f0d · 5281e793 · 5281e793
6 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add "tag" return type for automatic annotation in Nuclio (<https://github.com/openvinotoolkit/cvat/pull/3896>)
 - Dataset importing to a project (<https://github.com/openvinotoolkit/cvat/pull/3790>)
 - User is able to customize information that text labels show (<https://github.com/openvinotoolkit/cvat/pull/4029>)
+- Support for uploading manifest with any name (<https://github.com/openvinotoolkit/cvat/pull/4041>)

 ### Changed
 - TDB

--- a/cvat-ui/src/components/file-manager/file-manager.tsx
+++ b/cvat-ui/src/components/file-manager/file-manager.tsx
@@ -90,14 +90,13 @@ export class FileManager extends React.PureComponent<Props, State> {
        };
    }

-    private loadData = (key: string): Promise<void> =>
-        new Promise<void>((resolve, reject): void => {
-            const { onLoadData } = this.props;
+    private loadData = (key: string): Promise<void> => new Promise<void>((resolve, reject): void => {
+        const { onLoadData } = this.props;

-            const success = (): void => resolve();
-            const failure = (): void => reject();
-            onLoadData(key, success, failure);
-        });
+        const success = (): void => resolve();
+        const failure = (): void => reject();
+        onLoadData(key, success, failure);
+    });

    public reset(): void {
        const { active } = this.state;
@@ -161,8 +160,8 @@ export class FileManager extends React.PureComponent<Props, State> {
    private renderShareSelector(): JSX.Element {
        function renderTreeNodes(data: TreeNodeNormal[]): JSX.Element[] {
            // sort alphabetically
-            data.sort((a: TreeNodeNormal, b: TreeNodeNormal): number =>
-                a.key.toLocaleString().localeCompare(b.key.toLocaleString()));
+            data.sort((a: TreeNodeNormal, b: TreeNodeNormal): number => (
+                a.key.toLocaleString().localeCompare(b.key.toLocaleString())));
            return data.map((item: TreeNodeNormal) => {
                if (item.children) {
                    return (
@@ -205,8 +204,8 @@ export class FileManager extends React.PureComponent<Props, State> {
                                halfChecked: ReactText[];
                            },
                        ): void => {
-                            const keys = (checkedKeys as ReactText[]).map((text: ReactText): string =>
-                                text.toLocaleString());
+                            const keys = (checkedKeys as ReactText[]).map((text: ReactText): string => (
+                                text.toLocaleString()));
                            this.setState({
                                files: {
                                    ...files,
@@ -267,7 +266,7 @@ export class FileManager extends React.PureComponent<Props, State> {
                <CloudStorageTab
                    formRef={this.cloudStorageTabFormRef}
                    cloudStorage={cloudStorage}
-                    selectedFiles={files.cloudStorage.filter((item) => !item.endsWith('manifest.jsonl'))}
+                    selectedFiles={files.cloudStorage.filter((item) => !item.endsWith('.jsonl'))}
                    onSelectCloudStorage={(_cloudStorage: CloudStorage | null) => {
                        this.setState({ cloudStorage: _cloudStorage });
                    }}

--- a/cvat/apps/engine/task.py
+++ b/cvat/apps/engine/task.py
@@ -24,7 +24,7 @@ from cvat.apps.engine.log import slogger
 from cvat.apps.engine.media_extractors import (MEDIA_TYPES, Mpeg4ChunkWriter, Mpeg4CompressedChunkWriter,
    ValidateDimension, ZipChunkWriter, ZipCompressedChunkWriter, get_mime, sort)
 from cvat.apps.engine.utils import av_scan_paths
-from utils.dataset_manifest import ImageManifestManager, VideoManifestManager
+from utils.dataset_manifest import ImageManifestManager, VideoManifestManager, is_manifest
 from utils.dataset_manifest.core import VideoManifestValidator
 from utils.dataset_manifest.utils import detect_related_images
 from .cloud_provider import get_cloud_storage_instance, Credentials
@@ -113,7 +113,7 @@ def _save_task_to_db(db_task):
    db_task.data.save()
    db_task.save()

-def _count_files(data, manifest_file=None):
+def _count_files(data, manifest_files=None):
    share_root = settings.SHARE_ROOT
    server_files = []

@@ -143,8 +143,8 @@ def _count_files(data, manifest_file=None):
            mime = get_mime(full_path)
            if mime in counter:
                counter[mime].append(rel_path)
-            elif 'manifest.jsonl' == os.path.basename(rel_path):
-                manifest_file.append(rel_path)
+            elif rel_path.endswith('.jsonl'):
+                manifest_files.append(rel_path)
            else:
                slogger.glob.warn("Skip '{}' file (its mime type doesn't "
                    "correspond to supported MIME file type)".format(full_path))
@@ -163,7 +163,7 @@ def _count_files(data, manifest_file=None):

    return counter

-def _validate_data(counter, manifest_file=None):
+def _validate_data(counter, manifest_files=None):
    unique_entries = 0
    multiple_entries = 0
    for media_type, media_config in MEDIA_TYPES.items():
@@ -173,7 +173,7 @@ def _validate_data(counter, manifest_file=None):
            else:
                multiple_entries += len(counter[media_type])

-            if manifest_file and media_type not in ('video', 'image'):
+            if manifest_files and media_type not in ('video', 'image'):
                raise Exception('File with meta information can only be uploaded with video/images ')

    if unique_entries == 1 and multiple_entries > 0 or unique_entries > 1:
@@ -193,6 +193,16 @@ def _validate_data(counter, manifest_file=None):

    return counter, task_modes[0]

+def _validate_manifest(manifests, root_dir):
+    if manifests:
+        if len(manifests) != 1:
+            raise Exception('Only one manifest file can be attached with data')
+        full_manifest_path = os.path.join(root_dir, manifests[0])
+        if is_manifest(full_manifest_path):
+            return manifests[0]
+        raise Exception('Invalid manifest was uploaded')
+    return None
+
 def _download_data(urls, upload_dir):
    job = rq.get_current_job()
    local_files = {}
@@ -233,48 +243,57 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False):
    if data['remote_files'] and not isDatasetImport:
        data['remote_files'] = _download_data(data['remote_files'], upload_dir)

-    manifest_file = []
-    media = _count_files(data, manifest_file)
-    media, task_mode = _validate_data(media, manifest_file)
-    if manifest_file and (not settings.USE_CACHE or db_data.storage_method != models.StorageMethodChoice.CACHE):
-        raise Exception("File with meta information can be uploaded if 'Use cache' option is also selected")
+    manifest_files = []
+    media = _count_files(data, manifest_files)
+    media, task_mode = _validate_data(media, manifest_files)

    if data['server_files']:
        if db_data.storage == models.StorageChoice.LOCAL:
            _copy_data_from_source(data['server_files'], upload_dir, data.get('server_files_path'))
        elif db_data.storage == models.StorageChoice.SHARE:
            upload_dir = settings.SHARE_ROOT
-        else: # cloud storage
-            if not manifest_file: raise Exception('A manifest file not found')
-            db_cloud_storage = db_data.cloud_storage
-            credentials = Credentials()
-            credentials.convert_from_db({
-               'type': db_cloud_storage.credentials_type,
-               'value': db_cloud_storage.credentials,
-            })

-            details = {
-                'resource': db_cloud_storage.resource,
-                'credentials': credentials,
-                'specific_attributes': db_cloud_storage.get_specific_attributes()
-            }
-            cloud_storage_instance = get_cloud_storage_instance(cloud_provider=db_cloud_storage.provider_type, **details)
-            sorted_media = sort(media['image'], data['sorting_method'])
-            first_sorted_media_image = sorted_media[0]
-            cloud_storage_instance.download_file(first_sorted_media_image, os.path.join(upload_dir, first_sorted_media_image))
-
-            # prepare task manifest file from cloud storage manifest file
-            # NOTE we should create manifest before defining chunk_size
-            # FIXME in the future when will be implemented archive support
-            manifest = ImageManifestManager(db_data.get_manifest_path())
-            cloud_storage_manifest = ImageManifestManager(
-                os.path.join(db_data.cloud_storage.get_storage_dirname(), manifest_file[0]),
-                db_data.cloud_storage.get_storage_dirname()
-            )
-            cloud_storage_manifest.set_index()
-            sequence, content = cloud_storage_manifest.get_subset(sorted_media)
-            sorted_content = (i[1] for i in sorted(zip(sequence, content)))
-            manifest.create(sorted_content)
+    manifest_root = None
+    if db_data.storage in {models.StorageChoice.LOCAL, models.StorageChoice.SHARE}:
+        manifest_root = upload_dir
+    elif db_data.storage == models.StorageChoice.CLOUD_STORAGE:
+        manifest_root = db_data.cloud_storage.get_storage_dirname()
+
+    manifest_file = _validate_manifest(manifest_files, manifest_root)
+    if manifest_file and (not settings.USE_CACHE or db_data.storage_method != models.StorageMethodChoice.CACHE):
+        raise Exception("File with meta information can be uploaded if 'Use cache' option is also selected")
+
+    if data['server_files'] and db_data.storage == models.StorageChoice.CLOUD_STORAGE:
+        if not manifest_file: raise Exception('A manifest file not found')
+        db_cloud_storage = db_data.cloud_storage
+        credentials = Credentials()
+        credentials.convert_from_db({
+            'type': db_cloud_storage.credentials_type,
+            'value': db_cloud_storage.credentials,
+        })
+
+        details = {
+            'resource': db_cloud_storage.resource,
+            'credentials': credentials,
+            'specific_attributes': db_cloud_storage.get_specific_attributes()
+        }
+        cloud_storage_instance = get_cloud_storage_instance(cloud_provider=db_cloud_storage.provider_type, **details)
+        sorted_media = sort(media['image'], data['sorting_method'])
+        first_sorted_media_image = sorted_media[0]
+        cloud_storage_instance.download_file(first_sorted_media_image, os.path.join(upload_dir, first_sorted_media_image))
+
+        # prepare task manifest file from cloud storage manifest file
+        # NOTE we should create manifest before defining chunk_size
+        # FIXME in the future when will be implemented archive support
+        manifest = ImageManifestManager(db_data.get_manifest_path())
+        cloud_storage_manifest = ImageManifestManager(
+            os.path.join(db_data.cloud_storage.get_storage_dirname(), manifest_file),
+            db_data.cloud_storage.get_storage_dirname()
+        )
+        cloud_storage_manifest.set_index()
+        sequence, content = cloud_storage_manifest.get_subset(sorted_media)
+        sorted_content = (i[1] for i in sorted(zip(sequence, content)))
+        manifest.create(sorted_content)

    av_scan_paths(upload_dir)

@@ -432,12 +451,12 @@ def _create_thread(db_task, data, isBackupRestore=False, isDatasetImport=False):
            if not media_files:
                continue

-            # replace manifest file (e.g was uploaded 'subdir/manifest.jsonl')
+            # replace manifest file (e.g was uploaded 'subdir/manifest.jsonl' or 'some_manifest.jsonl')
            if manifest_file and not os.path.exists(db_data.get_manifest_path()):
-                shutil.copyfile(os.path.join(upload_dir, manifest_file[0]),
+                shutil.copyfile(os.path.join(upload_dir, manifest_file),
                    db_data.get_manifest_path())
                if upload_dir != settings.SHARE_ROOT:
-                    os.remove(os.path.join(upload_dir, manifest_file[0]))
+                    os.remove(os.path.join(upload_dir, manifest_file))

            if task_mode == MEDIA_TYPES['video']['mode']:
                try:

--- a/cvat/apps/engine/views.py
+++ b/cvat/apps/engine/views.py
@@ -1359,7 +1359,7 @@ class CloudStorageViewSet(auth.CloudStorageGetQuerySetMixin, viewsets.ModelViewS
            storage = get_cloud_storage_instance(cloud_provider=db_storage.provider_type, **details)
            if not db_storage.manifests.count():
                raise Exception('There is no manifest file')
-            manifest_path = request.query_params.get('manifest_path', 'manifest.jsonl')
+            manifest_path = request.query_params.get('manifest_path', db_storage.manifests.first().filename)
            file_status = storage.get_file_status(manifest_path)
            if file_status == Status.NOT_FOUND:
                raise FileNotFoundError(errno.ENOENT,

--- a/utils/dataset_manifest/__init__.py
+++ b/utils/dataset_manifest/__init__.py
 # Copyright (C) 2021 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
-from .core import VideoManifestManager, ImageManifestManager
\ No newline at end of file
+from .core import VideoManifestManager, ImageManifestManager, is_manifest
\ No newline at end of file
--- a/utils/dataset_manifest/core.py
+++ b/utils/dataset_manifest/core.py
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: MIT

+from enum import Enum
 import av
 import json
 import os
-from abc import ABC, abstractmethod, abstractproperty
+
+from abc import ABC, abstractmethod, abstractproperty, abstractstaticmethod
 from contextlib import closing
 from tempfile import NamedTemporaryFile
-
 from PIL import Image
-from .utils import md5_hash, rotate_image, sort, SortingMethod
+from json.decoder import JSONDecodeError
+
+from .utils import SortingMethod, md5_hash, rotate_image, sort

 class VideoStreamReader:
    def __init__(self, source_path, chunk_size, force):
@@ -238,8 +241,19 @@ class Dataset3DImagesReader(DatasetImagesReader):
                yield dict()

 class _Manifest:
+    class SupportedVersion(str, Enum):
+        V1 = '1.0'
+        V1_1 = '1.1'
+
+        @classmethod
+        def choices(cls):
+            return (x.value for x in cls)
+
+        def __str__(self):
+            return self.value
+
    FILE_NAME = 'manifest.jsonl'
-    VERSION = '1.1'
+    VERSION = SupportedVersion.V1_1

    def __init__(self, path, upload_dir=None):
        assert path, 'A path to manifest file not found'
@@ -509,13 +523,6 @@ class VideoManifestManager(_ManifestManager):
    def get_subset(self, subset_names):
        raise NotImplementedError()

-#TODO: add generic manifest structure file validation
-class ManifestValidator:
-    def validate_base_info(self):
-        with open(self._manifest.path, 'r') as manifest_file:
-            assert self._manifest.VERSION != json.loads(manifest_file.readline())['version']
-            assert self._manifest.TYPE != json.loads(manifest_file.readline())['type']
-
 class VideoManifestValidator(VideoManifestManager):
    def __init__(self, source_path, manifest_path):
        self._source_path = source_path
@@ -607,12 +614,113 @@ class ImageManifestManager(_ManifestManager):
            image_name = f"{image['name']}{image['extension']}"
            if image_name in subset_names:
                index_list.append(subset_names.index(image_name))
-                subset.append({
+                properties = {
                    'name': f"{image['name']}",
                    'extension': f"{image['extension']}",
                    'width': image['width'],
                    'height': image['height'],
-                    'meta': image['meta'],
-                    'checksum': f"{image['checksum']}"
-                })
+                }
+                for optional_field in {'meta', 'checksum'}:
+                    value = image.get(optional_field)
+                    if value:
+                        properties[optional_field] =  value
+                subset.append(properties)
        return index_list, subset
+
+
+class _BaseManifestValidator(ABC):
+    def __init__(self, full_manifest_path):
+        self._manifest = _Manifest(full_manifest_path)
+
+    def validate(self):
+        try:
+            # we cannot use index in general because manifest may be e.g. in share point with ro mode
+            with open(self._manifest.path, 'r') as manifest:
+                for validator in self.validators:
+                    line = json.loads(manifest.readline().strip())
+                    validator(line)
+            return True
+        except (ValueError, KeyError, JSONDecodeError):
+            return False
+
+    @staticmethod
+    def _validate_version(_dict):
+        if not _dict['version'] in _Manifest.SupportedVersion.choices():
+            raise ValueError('Incorrect version field')
+
+    def _validate_type(self, _dict):
+        if not _dict['type'] == self.TYPE:
+            raise ValueError('Incorrect type field')
+
+    @abstractproperty
+    def validators(self):
+        pass
+
+    @abstractstaticmethod
+    def _validate_first_item(_dict):
+        pass
+
+class _VideoManifestStructureValidator(_BaseManifestValidator):
+    TYPE = 'video'
+
+    @property
+    def validators(self):
+        return (
+            self._validate_version,
+            self._validate_type,
+            self._validate_properties,
+            self._validate_first_item,
+        )
+
+    @staticmethod
+    def _validate_properties(_dict):
+        properties = _dict['properties']
+        if not isinstance(properties['name'], str):
+            raise ValueError('Incorrect name field')
+        if not isinstance(properties['resolution'], list):
+            raise ValueError('Incorrect resolution field')
+        if not isinstance(properties['length'], int) or properties['length'] == 0:
+            raise ValueError('Incorrect length field')
+
+    @staticmethod
+    def _validate_first_item(_dict):
+        if not isinstance(_dict['number'], int):
+            raise ValueError('Incorrect number field')
+        if not isinstance(_dict['pts'], int):
+            raise ValueError('Incorrect pts field')
+
+class _DatasetManifestStructureValidator(_BaseManifestValidator):
+    TYPE = 'images'
+
+    @property
+    def validators(self):
+        return (
+            self._validate_version,
+            self._validate_type,
+            self._validate_first_item,
+        )
+
+    @staticmethod
+    def _validate_first_item(_dict):
+        if not isinstance(_dict['name'], str):
+            raise ValueError('Incorrect name field')
+        if not isinstance(_dict['extension'], str):
+            raise ValueError('Incorrect extension field')
+        # width and height are required for 2d data
+        # FIXME for 3d when manual preparation of the manifest will be implemented
+        if not isinstance(_dict['width'], int):
+            raise ValueError('Incorrect width field')
+        if not isinstance(_dict['height'], int):
+            raise ValueError('Incorrect height field')
+
+def is_manifest(full_manifest_path):
+    return _is_video_manifest(full_manifest_path) or \
+        _is_dataset_manifest(full_manifest_path)
+
+def _is_video_manifest(full_manifest_path):
+    validator = _VideoManifestStructureValidator(full_manifest_path)
+    return validator.validate()
+
+def _is_dataset_manifest(full_manifest_path):
+    validator = _DatasetManifestStructureValidator(full_manifest_path)
+    return validator.validate()
\ No newline at end of file