librispeech.py 4.7 KB
Newer Older
X
Xinghai Sun 已提交
1 2 3 4 5 6 7 8
"""
   Download, unpack and create manifest for LibriSpeech dataset.

   Manifest is a json file with each line containing one audio clip filepath,
   its transcription text string, and its duration. It serves as a unified
   interface to organize different data sets.
"""

9
import paddle.v2 as paddle
10
from paddle.v2.dataset.common import md5file
11 12 13 14 15 16 17
import os
import wget
import tarfile
import argparse
import soundfile
import json

18
# Root directory under which downloaded speech datasets are cached.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Download locations of the individual LibriSpeech subsets.
URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

# Expected MD5 checksums of the archives above.
# NOTE(review): no checksum is defined here for the test-other and dev-other
# archives, so those two URLs cannot currently be passed to download().
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
34 35 36 37 38 39 40

# Command-line interface of the script.
parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument(
    "--target_dir",
    default=os.path.join(DATA_HOME, "Libri"),
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest.libri",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
# NOTE: arguments are parsed at module import time; main() reads the
# resulting module-level `args`.
args = parser.parse_args()


50 51 52 53 54
def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.

    The download is skipped when a file with the expected name already
    exists in target_dir and its MD5 matches md5sum.

    :param url: URL of the file; its last path component is the file name.
    :param md5sum: Expected MD5 hex digest of the downloaded file.
    :param target_dir: Directory to download into (created if missing).
    :return: Path of the downloaded file inside target_dir.
    :raises AssertionError: If the MD5 of the downloaded file does not
        match md5sum.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])

    # Reuse a previous download only if it is intact.
    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        return filepath

    if os.path.exists(filepath):
        # A stale or partial file is in the way. Remove it first: the wget
        # package avoids overwriting existing files by saving under a
        # different name, which would leave the corrupt file at `filepath`
        # and make the checksum below fail on every run.
        os.remove(filepath)

    print("Downloading %s ..." % url)
    wget.download(url, target_dir)
    print("\nMD5 Chesksum %s ..." % filepath)
    # Raised explicitly (not via `assert`) so the check still runs under
    # `python -O`; the exception type is kept for existing callers.
    if md5file(filepath) != md5sum:
        raise AssertionError("MD5 checksum failed.")
    return filepath


def unpack(filepath, target_dir):
    """
    Unpack the file to the target_dir.

    :param filepath: Path of the tar archive (any compression that
        tarfile.open detects automatically).
    :param target_dir: Directory to extract the archive members into.
    :return: target_dir, unchanged.
    """
    print("Unpacking %s ..." % filepath)
    # NOTE: extractall() trusts the member paths stored in the archive;
    # only use this on archives from a trusted source.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    return target_dir


def create_manifest(data_dir, manifest_path):
    """
    Create a manifest file summarizing the dataset (list of filepath and meta
    data).

    Each line of the manifest contains one audio clip filepath, its
    transcription text string, and its duration. Manifest file serves as a
    unified interface to organize data sets.

    :param data_dir: Directory that is walked recursively for transcript
        files (names ending in 'trans.txt').
    :param manifest_path: Path of the manifest file to write; it is only
        written after all clips have been processed.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in os.walk(data_dir):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not text_filelist:
            continue
        # os.walk() already yields `subfolder` prefixed with data_dir, so
        # joining data_dir again would duplicate it for relative paths.
        text_filepath = os.path.join(subfolder, text_filelist[0])
        with open(text_filepath) as text_file:
            for line in text_file:
                # Each transcript line is "<clip id> <word> <word> ...".
                segments = line.strip().split()
                if not segments:
                    # Blank line: there is no clip id to look up.
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                # Number of frames divided by frames per second.
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


110 111 112 113 114
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download, unpack and create summary manifest file.

    :param url: URL of the dataset archive.
    :param md5sum: Expected MD5 hex digest of the archive.
    :param target_dir: Directory the archive is downloaded to and
        unpacked in.
    :param manifest_path: Path of the manifest file to write.
    """
    filepath = download(url, md5sum, target_dir)
    unpacked_dir = unpack(filepath, target_dir)
    create_manifest(unpacked_dir, manifest_path)


def main():
    """
    Prepare the test-clean, dev-clean and train-clean-100 subsets.

    Each subset is stored in its own sub-directory of args.target_dir and
    gets a manifest named "<args.manifest_prefix>.<subset name>".
    """
    # (archive URL, expected MD5, subset name), processed in this order.
    subsets = [
        (URL_TEST_CLEAN, MD5_TEST_CLEAN, "test-clean"),
        (URL_DEV_CLEAN, MD5_DEV_CLEAN, "dev-clean"),
        (URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100, "train-clean-100"),
    ]
    for url, md5sum, name in subsets:
        prepare_dataset(
            url=url,
            md5sum=md5sum,
            target_dir=os.path.join(args.target_dir, name),
            manifest_path=args.manifest_prefix + "." + name)


if __name__ == '__main__':
    main()