utility.py 3.0 KB
Newer Older
1
"""Contains data helper functions."""
2 3 4 5 6
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
7
import codecs
Y
yangyaming 已提交
8 9
import os
import tarfile
10 11
import time
from Queue import Queue
12
from threading import Thread
13
from multiprocessing import Process, Manager, Value
L
lfchener 已提交
14
from paddle.dataset.common import md5file
15 16 17


def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    The manifest is read line by line; every line is parsed as one JSON
    object that must carry a "duration" entry. Instances with durations
    outside [min_duration, max_duration] will be filtered out.

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    # The context manager guarantees the file handle is released, including
    # when a malformed line aborts parsing with an exception.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json.loads reports malformed text as ValueError; callers
                # of this helper expect IOError for an unparsable manifest.
                raise IOError("Error reading manifest: %s" % str(e))
            if min_duration <= json_data["duration"] <= max_duration:
                manifest.append(json_data)
    return manifest
Y
yangyaming 已提交
43 44


45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
def getfile_insensitive(path):
    """Resolve a path whose file name may differ in letter case.

    The directory part of ``path`` is listed (the current directory when
    ``path`` has no directory part) and the first regular file whose
    lower-cased name equals the lower-cased file name of ``path`` is
    returned.

    :param path: Path whose final component is matched case-insensitively.
    :type path: basestring
    :return: Path of the matching file, or None when nothing matches.
    """
    parent, basename = os.path.split(path)
    if not parent:
        parent = '.'
    wanted = basename.lower()
    for entry in os.listdir(parent):
        candidate = os.path.join(parent, entry)
        # Directories and other non-regular entries never match.
        if entry.lower() == wanted and os.path.isfile(candidate):
            return candidate
    return None


def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir.

    Creates target_dir if it does not exist, then runs
    ``wget -c <url> <extra_args> -P <target_dir>`` through the shell.

    :param url: URL handed to wget.
    :type url: basestring
    :param target_dir: Directory passed to wget's -P option.
    :type target_dir: basestring
    :param extra_args: Additional wget command-line options, inserted into
                       the command verbatim.
    :type extra_args: basestring
    :return: The value returned by os.system for the wget command.
    :rtype: int
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    # Quote url and target_dir so that characters such as '&', ';' or spaces
    # reach wget literally instead of being interpreted by the shell.
    # extra_args is intentionally left unquoted: it may hold several
    # options that the shell must split into separate arguments.
    command = "wget -c %s %s -P %s" % (quote(url), extra_args,
                                       quote(target_dir))
    ret_code = os.system(command)
    return ret_code


Y
yangyaming 已提交
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.

    The local file name is the last '/'-separated component of url. When a
    file with that name already exists in target_dir and its MD5 matches
    md5sum, nothing is downloaded.

    :param url: URL of the file to fetch with wget.
    :type url: basestring
    :param md5sum: Expected MD5 hex digest of the downloaded file.
    :type md5sum: basestring
    :param target_dir: Directory to download into; created if missing.
    :type target_dir: basestring
    :return: Path of the local file.
    :rtype: basestring
    :raises RuntimeError: If the MD5 of the downloaded file does not match.
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        # Quote both operands so that characters such as '&', ';' or spaces
        # reach wget literally instead of being interpreted by the shell.
        os.system("wget -c %s -P %s" % (quote(url), quote(target_dir)))
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract.
    :type filepath: basestring
    :param target_dir: Directory the archive members are extracted into.
    :type target_dir: basestring
    :param rm_tar: If true, delete the archive after a successful
                   extraction.
    :type rm_tar: bool
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this on archives from a trusted source.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    if rm_tar:
        os.remove(filepath)
87 88 89 90


class XmapEndSignal:
    """Empty marker type with no attributes or methods.

    NOTE(review): presumably used as an end-of-stream sentinel by the xmap
    helpers; no usage is visible in this part of the file -- confirm.
    """