"""Prepare Librispeech ASR datasets.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
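# For reference, each manifest line written by create_manifest() is a single
# JSON object; an illustrative (not real) example:
#   {"audio_filepath": "/path/to/LibriSpeech/dev-clean/xx/yyy/xx-yyy-0000.flac",
#    "duration": 5.86, "text": "a lowercased transcript"}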
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util
import os
import sys
import tarfile
import argparse
import soundfile
import json
import codecs
from paddle.v2.dataset.common import md5file

URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default='~/.cache/paddle/dataset/speech/libri',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean, dev-clean,"
    " train-clean-100). (default: %(default)s)")
args = parser.parse_args()
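# Typical invocations (paths are illustrative; flags are the ones defined above):
#   python librispeech.py --target_dir=~/libri_data --manifest_prefix=manifest.libri
#   python librispeech.py --full_download=False   # only test-clean, dev-clean
#                                                 # and train-clean-100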


def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
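    # NOTE: downloading shells out to the external `wget` command (must be on
    # PATH); `-c` lets a partially downloaded file resume instead of restarting.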
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        ret = os.system("wget -c " + url + " -P " + target_dir)
        print(ret)
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
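            # Each LibriSpeech *.trans.txt line pairs an utterance id with its
            # uppercase transcript, e.g. (illustrative):
            #   "xx-yyy-0000 SOME UPPERCASE TRANSCRIPT"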
            for line in open(text_filepath):
                segments = line.strip().split()
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(data_dir, subfolder,
                                              segments[0] + '.flac')
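                # soundfile.read returns the sample array and the sample rate;
                # duration in seconds is therefore num_samples / sample_rate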
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
        filepath = download(url, md5sum, target_dir)
        # unpack
        unpack(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
        target_dir=os.path.join(args.target_dir, "test-clean"),
        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
    if args.full_download:
        prepare_dataset(
            url=URL_TRAIN_CLEAN_100,
            md5sum=MD5_TRAIN_CLEAN_100,
            target_dir=os.path.join(args.target_dir, "train-clean-100"),
            manifest_path=args.manifest_prefix + ".train-clean-100")
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,
            target_dir=os.path.join(args.target_dir, "test-other"),
            manifest_path=args.manifest_prefix + ".test-other")
        prepare_dataset(
            url=URL_DEV_OTHER,
            md5sum=MD5_DEV_OTHER,
            target_dir=os.path.join(args.target_dir, "dev-other"),
            manifest_path=args.manifest_prefix + ".dev-other")
        prepare_dataset(
            url=URL_TRAIN_CLEAN_360,
            md5sum=MD5_TRAIN_CLEAN_360,
            target_dir=os.path.join(args.target_dir, "train-clean-360"),
            manifest_path=args.manifest_prefix + ".train-clean-360")
        prepare_dataset(
            url=URL_TRAIN_OTHER_500,
            md5sum=MD5_TRAIN_OTHER_500,
            target_dir=os.path.join(args.target_dir, "train-other-500"),
            manifest_path=args.manifest_prefix + ".train-other-500")


if __name__ == '__main__':
    main()