Manifest optimization (#3712)

5b890b17 · Maria Khrustaleva · GitHub · cf6878e1 · 5b890b17 · 5b890b17
7 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,10 +12,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - cvat-ui: support cloud storages (<https://github.com/openvinotoolkit/cvat/pull/3372>)
 - interactor: add HRNet interactive segmentation serverless function (<https://github.com/openvinotoolkit/cvat/pull/3740>)
 - Added GPU implementation for SiamMask, reworked tracking approach (<https://github.com/openvinotoolkit/cvat/pull/3571>)
+- Progress bar for manifest creation (<https://github.com/openvinotoolkit/cvat/pull/3712>)

 ### Changed

 - UI tracking has been reworked (<https://github.com/openvinotoolkit/cvat/pull/3571>)
+- Manifest generation: reduced creation time (<https://github.com/openvinotoolkit/cvat/pull/3712>)

 ### Deprecated


--- a/cvat/apps/engine/migrations/0038_manifest.py
+++ b/cvat/apps/engine/migrations/0038_manifest.py
@@ -43,10 +43,11 @@ def migrate2meta(apps, shema_editor):
                    continue
                media_file = os.path.join(data_dir, db_data.video.path)
                logger.info('Preparing of the video meta has begun')
-                meta = VideoManifestManager(manifest_path=upload_dir) \
-                    .prepare_meta(media_file=media_file, force=True)
+                manifest = VideoManifestManager(manifest_path=upload_dir)
+                manifest.link(media_file=media_file, force=True)
+                manifest.init_index()
                with open(meta_path, "w") as meta_file:
-                    for idx, pts, _ in meta:
+                    for idx, pts, _ in manifest.reader:
                        meta_file.write(f"{idx} {pts}\n")
            else:
                name_format = "dummy_{}.txt"
@@ -87,12 +88,9 @@ def migrate2manifest(apps, shema_editor):
            if hasattr(db_data, 'video'):
                media_file = os.path.join(data_dir, db_data.video.path)
                manifest = VideoManifestManager(manifest_path=upload_dir)
-                logger.info('Preparing of the video meta information has begun')
-                meta_info = manifest.prepare_meta(media_file=media_file, force=True)
+                manifest.link(media_file=media_file, force=True)
                logger.info('Manifest creating has begun')
-                manifest.create(meta_info)
-                logger.info('Index creating has begun')
-                manifest.init_index()
+                manifest.create()
            else:
                manifest = ImageManifestManager(manifest_path=upload_dir)
                sources = []
@@ -105,36 +103,21 @@ def migrate2manifest(apps, shema_editor):
                    sources = [os.path.join(data_dir, db_image.path) for db_image in db_data.images.all().order_by('frame')]
                if any(list(filter(lambda x: x.dimension==DimensionType.DIM_3D, db_data.tasks.all()))):
                    logger.info('Preparing of images 3d meta information has begun')
-                    content = []
-                    for source in sources:
-                        name, ext = os.path.splitext(os.path.relpath(source, upload_dir))
-                        content.append({
-                            'name': name,
-                            'extension': ext
-                        })
+                    manifest.link(sources=sources, data_dir=data_dir, DIM_3D=True)
                else:
                    logger.info('Preparing of 2d images meta information has begun')
-                    meta_info = manifest.prepare_meta(sources=sources, data_dir=data_dir)
-                    content = meta_info.content
+                    manifest.link(sources=sources, data_dir=data_dir)

                if db_data.storage == StorageChoice.SHARE:
                    def _get_frame_step(str_):
                        match = search("step\s*=\s*([1-9]\d*)", str_)
                        return int(match.group(1)) if match else 1
                    logger.info('Data is located on the share, metadata update has been started')
-                    step = _get_frame_step(db_data.frame_filter)
-                    start = db_data.start_frame
-                    stop = db_data.stop_frame + 1
-                    images_range = range(start, stop, step)
-                    result_content = []
-                    for i in range(stop):
-                        item = content.pop(0) if i in images_range else dict()
-                        result_content.append(item)
-                    content = result_content
+                    manifest.step = _get_frame_step(db_data.frame_filter)
+                    manifest.start = db_data.start_frame
+                    manifest.stop = db_data.stop_frame + 1
                logger.info('Manifest creating has begun')
-                manifest.create(content)
-                logger.info('Index creating has begun')
-                manifest.init_index()
+                manifest.create()
            logger.info('Succesfull migration for the data({})'.format(db_data.id))
        except Exception as ex:
            logger.error(str(ex))

--- a/cvat/apps/engine/task.py
+++ b/cvat/apps/engine/task.py
@@ -265,7 +265,6 @@ def _create_thread(tid, data, isImport=False):
            media_files = sorted(media['image'])
            content = cloud_storage_manifest.get_subset(media_files)
            manifest.create(content)
-            manifest.init_index()

    av_scan_paths(upload_dir)

@@ -424,8 +423,7 @@ def _create_thread(tid, data, isImport=False):
                            video_size = manifest.video_resolution
                            manifest_is_prepared = True
                        except Exception as ex:
-                            if os.path.exists(db_data.get_index_path()):
-                                os.remove(db_data.get_index_path())
+                            manifest.remove()
                            if isinstance(ex, AssertionError):
                                base_msg = str(ex)
                            else:
@@ -436,17 +434,16 @@ def _create_thread(tid, data, isImport=False):
                    if not manifest_is_prepared:
                        _update_status('Start prepare a manifest file')
                        manifest = VideoManifestManager(db_data.get_manifest_path())
-                        meta_info = manifest.prepare_meta(
+                        manifest.link(
                            media_file=media_files[0],
                            upload_dir=upload_dir,
                            chunk_size=db_data.chunk_size
                        )
-                        manifest.create(meta_info)
-                        manifest.init_index()
+                        manifest.create()
                        _update_status('A manifest had been created')

-                        all_frames = meta_info.get_size()
-                        video_size = meta_info.frame_sizes
+                        all_frames = len(manifest.reader)
+                        video_size = manifest.reader.resolution
                        manifest_is_prepared = True

                    db_data.size = len(range(db_data.start_frame, min(data['stop_frame'] + 1 \
@@ -454,10 +451,8 @@ def _create_thread(tid, data, isImport=False):
                    video_path = os.path.join(upload_dir, media_files[0])
                except Exception as ex:
                    db_data.storage_method = models.StorageMethodChoice.FILE_SYSTEM
-                    if os.path.exists(db_data.get_manifest_path()):
-                        os.remove(db_data.get_manifest_path())
-                    if os.path.exists(db_data.get_index_path()):
-                        os.remove(db_data.get_index_path())
+                    manifest.remove()
+                    del manifest
                    base_msg = str(ex) if isinstance(ex, AssertionError) \
                        else "Uploaded video does not support a quick way of task creating."
                    _update_status("{} The task will be created using the old method".format(base_msg))
@@ -465,24 +460,15 @@ def _create_thread(tid, data, isImport=False):
                db_data.size = len(extractor)
                manifest = ImageManifestManager(db_data.get_manifest_path())
                if not manifest_file:
-                    if db_task.dimension == models.DimensionType.DIM_2D:
-                        meta_info = manifest.prepare_meta(
-                            sources=extractor.absolute_source_paths,
-                            meta={ k: {'related_images': related_images[k] } for k in related_images },
-                            data_dir=upload_dir
-                        )
-                        content = meta_info.content
-                    else:
-                        content = []
-                        for source in extractor.absolute_source_paths:
-                            name, ext = os.path.splitext(os.path.relpath(source, upload_dir))
-                            content.append({
-                                'name': name,
-                                'meta': { 'related_images': related_images[''.join((name, ext))] },
-                                'extension': ext
-                            })
-                    manifest.create(content)
-                manifest.init_index()
+                    manifest.link(
+                        sources=extractor.absolute_source_paths,
+                        meta={ k: {'related_images': related_images[k] } for k in related_images },
+                        data_dir=upload_dir,
+                        DIM_3D=(db_task.dimension == models.DimensionType.DIM_3D),
+                    )
+                    manifest.create()
+                else:
+                    manifest.init_index()
                counter = itertools.count()
                for _, chunk_frames in itertools.groupby(extractor.frame_range, lambda x: next(counter) // db_data.chunk_size):
                    chunk_paths = [(extractor.get_path(i), i) for i in chunk_frames]

--- a/cvat/apps/engine/tests/test_rest_api.py
+++ b/cvat/apps/engine/tests/test_rest_api.py
@@ -2512,11 +2512,11 @@ def generate_manifest_file(data_type, manifest_path, sources):
    }

    if data_type == 'video':
-        manifest = VideoManifestManager(manifest_path)
+        manifest = VideoManifestManager(manifest_path, create_index=False)
    else:
-        manifest = ImageManifestManager(manifest_path)
-    prepared_meta = manifest.prepare_meta(**kwargs[data_type])
-    manifest.create(prepared_meta)
+        manifest = ImageManifestManager(manifest_path, create_index=False)
+    manifest.link(**kwargs[data_type])
+    manifest.create()

 class TaskDataAPITestCase(APITestCase):
    _image_sizes = {}

--- a/utils/dataset_manifest/core.py
+++ b/utils/dataset_manifest/core.py
--- a/utils/dataset_manifest/create.py
+++ b/utils/dataset_manifest/create.py
@@ -6,6 +6,7 @@ import os
 import sys
 import re
 from glob import glob
+from tqdm import tqdm

 from utils import detect_related_images, is_image, is_video

@@ -61,17 +62,18 @@ def main():
        try:
            assert len(sources), 'A images was not found'
            manifest = ImageManifestManager(manifest_path=manifest_directory)
-            meta_info = manifest.prepare_meta(sources=sources, meta=meta, is_sorted=False,
-                use_image_hash=True, data_dir=data_dir)
-            manifest.create(meta_info)
+            manifest.link(sources=sources, meta=meta, is_sorted=False,
+                    use_image_hash=True, data_dir=data_dir)
+            manifest.create(_tqdm=tqdm)
        except Exception as ex:
            sys.exit(str(ex))
    else: # video
        try:
            assert is_video(source), 'You can specify a video path or a directory/pattern with images'
            manifest = VideoManifestManager(manifest_path=manifest_directory)
+            manifest.link(media_file=source, force=args.force)
            try:
-                meta_info = manifest.prepare_meta(media_file=source, force=args.force)
+                manifest.create(_tqdm=tqdm)
            except AssertionError as ex:
                if str(ex) == 'Too few keyframes':
                    msg = 'NOTE: prepared manifest file contains too few key frames for smooth decoding.\n' \
@@ -80,7 +82,6 @@ def main():
                    sys.exit(2)
                else:
                    raise
-            manifest.create(meta_info)
        except Exception as ex:
            sys.exit(str(ex))


--- a/utils/dataset_manifest/requirements.txt
+++ b/utils/dataset_manifest/requirements.txt
 av==8.0.2 --no-binary=av
 opencv-python-headless==4.4.0.42
-Pillow==7.2.0
\ No newline at end of file
+Pillow==7.2.0
+tqdm==4.58.0
\ No newline at end of file