提交 58fa8790 编写于 作者: Y yangyaming

Extract common utility functions.

上级 cd16af88
...@@ -12,12 +12,12 @@ from __future__ import print_function ...@@ -12,12 +12,12 @@ from __future__ import print_function
import distutils.util import distutils.util
import os import os
import sys import sys
import tarfile
import argparse import argparse
import soundfile import soundfile
import json import json
import codecs import codecs
from paddle.v2.dataset.common import md5file from paddle.v2.dataset.common import md5file
from data_utils.utility import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
...@@ -59,33 +59,6 @@ parser.add_argument( ...@@ -59,33 +59,6 @@ parser.add_argument(
args = parser.parse_args() args = parser.parse_args()
def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.

    The download is skipped when a file with the expected name already exists
    in target_dir and its MD5 checksum equals md5sum.

    :param url: URL of the file. The text after the last "/" is used as the
                local file name.
    :param md5sum: Expected MD5 checksum, compared with md5file()'s result.
    :param target_dir: Directory to store the file in (created if missing).
    :return: Path of the local file.
    :raises RuntimeError: If the file is missing after the download attempt or
                          its checksum does not equal md5sum.
    :raises OSError: If the wget executable cannot be started.
    """
    # Imported locally so the block does not depend on a file-level import.
    import subprocess

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        print("File exists, skip downloading. (%s)" % filepath)
        return filepath
    print("Downloading %s ..." % url)
    # Pass an argument list instead of a shell string: characters such as
    # spaces, "&" or ";" in the URL or directory are then handed to wget
    # verbatim rather than being interpreted by the shell.
    subprocess.call(["wget", "-c", url, "-P", target_dir])
    print("\nMD5 Chesksum %s ..." % filepath)
    # A failed download may leave no file at all; report it as the same
    # checksum failure instead of letting md5file fail on a missing path.
    if not os.path.exists(filepath) or md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath
def unpack(filepath, target_dir):
    """
    Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract.
    :param target_dir: Directory the archive members are extracted into.
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(filepath) as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        tar.extractall(target_dir)
def create_manifest(data_dir, manifest_path): def create_manifest(data_dir, manifest_path):
""" """
Create a manifest json file summarizing the data set, with each line Create a manifest json file summarizing the data set, with each line
......
...@@ -5,6 +5,8 @@ from __future__ import print_function ...@@ -5,6 +5,8 @@ from __future__ import print_function
import json import json
import codecs import codecs
import os
import tarfile
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
...@@ -33,3 +35,28 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): ...@@ -33,3 +35,28 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
json_data["duration"] >= min_duration): json_data["duration"] >= min_duration):
manifest.append(json_data) manifest.append(json_data)
return manifest return manifest
def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.

    The download is skipped when a file with the expected name already exists
    in target_dir and its MD5 checksum equals md5sum.

    :param url: URL of the file. The text after the last "/" is used as the
                local file name.
    :param md5sum: Expected MD5 checksum, compared with md5file()'s result.
    :param target_dir: Directory to store the file in (created if missing).
    :return: Path of the local file.
    :raises RuntimeError: If the file is missing after the download attempt or
                          its checksum does not equal md5sum.
    :raises OSError: If the wget executable cannot be started.
    """
    # Imported locally so the block does not depend on a file-level import.
    import subprocess

    # NOTE(review): md5file is expected to be available at module level; its
    # import is not visible in this excerpt -- confirm it exists in this file.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        print("File exists, skip downloading. (%s)" % filepath)
        return filepath
    print("Downloading %s ..." % url)
    # Pass an argument list instead of a shell string: characters such as
    # spaces, "&" or ";" in the URL or directory are then handed to wget
    # verbatim rather than being interpreted by the shell.
    subprocess.call(["wget", "-c", url, "-P", target_dir])
    print("\nMD5 Chesksum %s ..." % filepath)
    # A failed download may leave no file at all; report it as the same
    # checksum failure instead of letting md5file fail on a missing path.
    if not os.path.exists(filepath) or md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath
def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract.
    :param target_dir: Directory the archive members are extracted into.
    :param rm_tar: If true, delete the archive after it has been extracted.
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(filepath) as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        tar.extractall(target_dir)
    # Removal happens only after a successful extraction and close.
    if rm_tar:
        os.remove(filepath)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册