"""Prepare Librispeech ASR datasets.

Download, unpack and create manifest files.
A manifest file is a JSON-format file with each line containing the
metadata (i.e. audio filepath, transcript and audio duration)
of one audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util
import os
import sys
import tarfile
import argparse
import soundfile
import json
import codecs
from paddle.v2.dataset.common import md5file

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Libri",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean, dev-clean"
    " train-clean-100). (default: %(default)s)")
args = parser.parse_args()
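# Example invocation (illustrative; adjust the flags to your setup):
#   python librispeech.py --manifest_prefix=manifest.libri --full_download=False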


def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        os.system("wget -c " + url + " -P " + target_dir)
        print("\nMD5 Chesksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """
    Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()


def create_manifest(data_dir, manifest_path):
    """
    Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
            for line in open(text_filepath):
                segments = line.strip().split()
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(data_dir, subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
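                # Duration in seconds: number of samples / sample rate.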
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download, unpack and create summary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
        filepath = download(url, md5sum, target_dir)
        # unpack
        unpack(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
        target_dir=os.path.join(args.target_dir, "test-clean"),
        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
    prepare_dataset(
        url=URL_TRAIN_CLEAN_100,
        md5sum=MD5_TRAIN_CLEAN_100,
        target_dir=os.path.join(args.target_dir, "train-clean-100"),
        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,
            target_dir=os.path.join(args.target_dir, "test-other"),
            manifest_path=args.manifest_prefix + ".test-other")
        prepare_dataset(
            url=URL_DEV_OTHER,
            md5sum=MD5_DEV_OTHER,
            target_dir=os.path.join(args.target_dir, "dev-other"),
            manifest_path=args.manifest_prefix + ".dev-other")
        prepare_dataset(
            url=URL_TRAIN_CLEAN_360,
            md5sum=MD5_TRAIN_CLEAN_360,
            target_dir=os.path.join(args.target_dir, "train-clean-360"),
            manifest_path=args.manifest_prefix + ".train-clean-360")
        prepare_dataset(
            url=URL_TRAIN_OTHER_500,
            md5sum=MD5_TRAIN_OTHER_500,
            target_dir=os.path.join(args.target_dir, "train-other-500"),
            manifest_path=args.manifest_prefix + ".train-other-500")


if __name__ == '__main__':
    main()