"""
2
    Download, unpack and create manifest json files for the Librespeech dataset.
X
Xinghai Sun 已提交
3

4 5 6
    A manifest is a json file summarizing filelist in a data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file in the data set.
X
Xinghai Sun 已提交
7 8
"""

import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file
import distutils.util
import os
import wget
import tarfile
import argparse
import soundfile
import json

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

parser = argparse.ArgumentParser(
    description='Download and prepare the LibriSpeech dataset.')
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Libri",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for LibriSpeech."
    " If False, only download a minimal requirement (test-clean, dev-clean and"
    " train-clean-100). (default: %(default)s)")
args = parser.parse_args()
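
# Example invocation (target directory and manifest prefix are illustrative):
#   python librispeech.py --target_dir ~/data/Libri \
#       --manifest_prefix manifest.libri --full_download True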


def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if md5file(filepath) != md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """
    Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()


def create_manifest(data_dir, manifest_path):
    """
    Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
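            # Each line of a *.trans.txt file has the form
            # "<utterance-id> <TRANSCRIPT IN UPPER CASE>".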
            for line in open(text_filepath):
                segments = line.strip().split()
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(data_dir, subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download, unpack and create summary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
        filepath = download(url, md5sum, target_dir)
        # unpack
        unpack(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
        target_dir=os.path.join(args.target_dir, "test-clean"),
        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
    prepare_dataset(
        url=URL_TRAIN_CLEAN_100,
        md5sum=MD5_TRAIN_CLEAN_100,
        target_dir=os.path.join(args.target_dir, "train-clean-100"),
        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,
            target_dir=os.path.join(args.target_dir, "test-other"),
            manifest_path=args.manifest_prefix + ".test-other")
        prepare_dataset(
            url=URL_DEV_OTHER,
            md5sum=MD5_DEV_OTHER,
            target_dir=os.path.join(args.target_dir, "dev-other"),
            manifest_path=args.manifest_prefix + ".dev-other")
        prepare_dataset(
            url=URL_TRAIN_CLEAN_360,
            md5sum=MD5_TRAIN_CLEAN_360,
            target_dir=os.path.join(args.target_dir, "train-clean-360"),
            manifest_path=args.manifest_prefix + ".train-clean-360")
        prepare_dataset(
            url=URL_TRAIN_OTHER_500,
            md5sum=MD5_TRAIN_OTHER_500,
            target_dir=os.path.join(args.target_dir, "train-other-500"),
            manifest_path=args.manifest_prefix + ".train-other-500")


if __name__ == '__main__':
    main()