librispeech.py 5.5 KB
Newer Older
1
"""Prepare Librispeech ASR datasets.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
8 9 10
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
X
Xinghai Sun 已提交
11

12
import distutils.util
13
import os
14
import sys
15 16 17
import argparse
import soundfile
import json
18
import codecs
19
from paddle.v2.dataset.common import md5file
Y
yangyaming 已提交
20
from data_utils.utility import download, unpack
21

22
# Default cache directory (under the user's home) used as the base of the
# default --target_dir.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Download URLs of the LibriSpeech archives, one per subset.
URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

# MD5 checksums handed to download() together with the matching URL.
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
40

41
# Command-line interface; the module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Libri",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean, dev-clean,"
    " train-clean-100). (default: %(default)s)")
# Parsed at module level because main() reads the global `args`.
args = parser.parse_args()


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set.

    Walks ``data_dir`` recursively. In every folder that contains a file
    whose name ends with ``trans.txt``, the first such file (in the order
    listed by ``os.walk``) is read line by line. Each non-blank line is
    split on whitespace: the first token names the audio file
    (``<token>.flac`` in the same folder) and the remaining tokens, joined
    by single spaces and lower-cased, form the transcription text.

    One JSON object per utterance is written to ``manifest_path``, one per
    line, with the keys ``audio_filepath``, ``duration`` (number of audio
    frames divided by the sample rate) and ``text``.

    :param data_dir: Root directory to scan for transcript and audio files.
    :type data_dir: str
    :param manifest_path: Path of the manifest file to write (overwritten).
    :type manifest_path: str
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not text_filelist:
            continue
        # os.walk() already yields `subfolder` prefixed with `data_dir`;
        # joining `data_dir` in again duplicates the prefix whenever
        # `data_dir` is a relative path.
        text_filepath = os.path.join(subfolder, text_filelist[0])
        # Context manager so the transcript handle is closed deterministically.
        with codecs.open(text_filepath, 'r', 'utf-8') as text_file:
            for line in text_file:
                segments = line.strip().split()
                if not segments:
                    # Blank line: there is no utterance id to resolve.
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


94
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download and unpack one dataset archive (unless already present), then
    create its summary manifest file.

    The archive is considered present when ``target_dir`` already contains
    a ``LibriSpeech`` entry; in that case only the manifest is rebuilt.
    """
    unpacked_root = os.path.join(target_dir, "LibriSpeech")
    if os.path.exists(unpacked_root):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        # Fetch the archive, then extract it into the same directory.
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
    # The manifest is (re)created on every call, downloaded or not.
    create_manifest(target_dir, manifest_path)
108 109 110 111


def main():
    """
    Prepare every requested LibriSpeech subset.

    The minimal subsets (test-clean, dev-clean, train-clean-100) are always
    prepared; the remaining ones only when ``args.full_download`` is truthy.
    Each subset is stored under ``<target_dir>/<subset>`` and its manifest is
    written to ``<manifest_prefix>.<subset>``.
    """
    subsets = [
        ("test-clean", URL_TEST_CLEAN, MD5_TEST_CLEAN),
        ("dev-clean", URL_DEV_CLEAN, MD5_DEV_CLEAN),
        ("train-clean-100", URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100),
    ]
    if args.full_download:
        subsets += [
            ("test-other", URL_TEST_OTHER, MD5_TEST_OTHER),
            ("dev-other", URL_DEV_OTHER, MD5_DEV_OTHER),
            ("train-clean-360", URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360),
            ("train-other-500", URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500),
        ]
    for subset_name, subset_url, subset_md5 in subsets:
        prepare_dataset(
            url=subset_url,
            md5sum=subset_md5,
            target_dir=os.path.join(args.target_dir, subset_name),
            manifest_path=args.manifest_prefix + "." + subset_name)
147 148 149 150


if __name__ == '__main__':
    main()