utility.py 2.2 KB
Newer Older
1
"""Contains data helper functions."""
2 3 4 5 6
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import json
import os
import subprocess
import tarfile

from paddle.v2.dataset.common import md5file
11 12 13


def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    The manifest is read line by line; every line is decoded as one JSON
    document and must provide a "duration" entry. Instances with durations
    outside [min_duration, max_duration] (bounds inclusive) will be filtered
    out.

    :param manifest_path: Manifest file to load and parse (read as UTF-8).
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    # The context manager guarantees the file handle is released, including
    # when a malformed line aborts the loop with an exception.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json.loads signals invalid JSON with ValueError (or its
                # subclass JSONDecodeError); report it as a manifest error.
                raise IOError("Error reading manifest: %s" % str(e))
            if min_duration <= json_data["duration"] <= max_duration:
                manifest.append(json_data)
    return manifest
Y
yangyaming 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63


def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.

    The file is stored in target_dir under the last "/"-separated component
    of the url. If a file with that name already exists and md5file() of it
    equals md5sum, the download is skipped.

    :param url: URL of the file to fetch (handed to the external `wget` tool).
    :type url: basestring
    :param md5sum: Expected checksum, compared against md5file(filepath).
    :type md5sum: basestring
    :param target_dir: Directory to download into; created if missing.
    :type target_dir: basestring
    :return: Path of the downloaded (or already present) file.
    :rtype: basestring
    :raises RuntimeError: If the checksum of the downloaded file mismatches.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        # Pass the arguments as a list (no shell) so that spaces or shell
        # metacharacters in url/target_dir cannot alter the command.
        # "-c" resumes a partial download, "-P" sets the output directory.
        # The exit status is not inspected; the checksum below decides
        # whether the download succeeded.
        subprocess.call(["wget", "-c", url, "-P", target_dir])
        print("\nMD5 Chesksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract. The compression is
                     whatever tarfile.open() detects in its default mode.
    :type filepath: basestring
    :param target_dir: Directory the archive members are extracted into.
    :type target_dir: basestring
    :param rm_tar: If true, delete the archive after a successful extraction.
    :type rm_tar: bool
    """
    print("Unpacking %s ..." % filepath)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this on archives from a trusted source (here the
    # caller is expected to have verified the checksum beforehand -- confirm).
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    # Only reached when extraction succeeded, so a failed unpack never
    # deletes the archive.
    if rm_tar:
        os.remove(filepath)