From d3eeb7fd76f8b9f86ca01e80f524dde652211428 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 7 Jun 2017 17:44:11 +0800 Subject: [PATCH] Refine librispeech.py for DeepSpeech2. Summary: 1. Add manifest line check. 2. Avoid re-unpacking if unpacked data already exists. 3. Add full_download (download all 7 sub-datasets of LibriSpeech). --- README.md | 5 ++- data/librispeech.py | 90 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index bb1815c0..403511d5 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added. ``` cd data python librispeech.py +cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` @@ -32,13 +33,13 @@ python librispeech.py --help For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False +python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all ``` More help for arguments: diff --git a/data/librispeech.py b/data/librispeech.py index 838fee59..8bc33575 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -1,13 +1,15 @@ """ - Download, unpack and create manifest for Librespeech dataset. + Download, unpack and create manifest file for the LibriSpeech dataset. - Manifest is a json file with each line containing one audio clip filepath, - its transcription text string, and its duration. It servers as a unified - interfance to organize different data sets. + A manifest file is a dataset summarization, with each line a json format + string containing meta data for one audio clip, including its filepath, + transcription string, and duration. It serves as a unified interface for + different data sets. 
""" import paddle.v2 as paddle from paddle.v2.dataset.common import md5file +import distutils.util import os import wget import tarfile @@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" +NUM_LINES_TEST_CLEAN = 2620 +NUM_LINES_TEST_OTHER = 2939 +NUM_LINES_DEV_CLEAN = 2703 +NUM_LINES_DEV_OTHER = 2864 +NUM_LINES_TRAIN_CLEAN_100 = 28539 +NUM_LINES_TRAIN_CLEAN_360 = 104014 +NUM_LINES_TRAIN_OTHER_500 = 148688 + parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -44,6 +56,13 @@ parser.add_argument( default="manifest.libri", type=str, help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") args = parser.parse_args() @@ -57,7 +76,10 @@ def download(url, md5sum, target_dir): print("Downloading %s ..." % url) wget.download(url, target_dir) print("\nMD5 Chesksum %s ..." % filepath) - assert md5file(filepath) == md5sum, "MD5 checksum failed." + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. 
(%s)" % filepath) return filepath @@ -69,7 +91,6 @@ def unpack(filepath, target_dir): tar = tarfile.open(filepath) tar.extractall(target_dir) tar.close() - return target_dir def create_manifest(data_dir, manifest_path): @@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..." % manifest_path) json_lines = [] - for subfolder, _, filelist in os.walk(data_dir): + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') ] @@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def prepare_dataset(url, md5sum, target_dir, manifest_path): +def verify_file_line_number(filepath, num_lines): + with open(filepath, 'r') as file: + return len(file.readlines()) == num_lines + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): """ Download, unpack and create summmary manifest file. """ + # download filepath = download(url, md5sum, target_dir) - unpacked_dir = unpack(filepath, target_dir) - create_manifest(unpacked_dir, manifest_path) + # unpack + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + unpack(filepath, target_dir) + else: + print("Unpacked data exists, skip unpacking.") + # create manifest and verify line number + create_manifest(target_dir, manifest_path) + if not verify_file_line_number(manifest_path, num_lines): + raise RuntimeError("Manifest line number check failed. 
" + "Please remove directory and try running the script " + "again.") def main(): @@ -121,17 +157,45 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean") + manifest_path=args.manifest_prefix + ".test-clean", + num_lines=NUM_LINES_TEST_CLEAN) prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean") + manifest_path=args.manifest_prefix + ".dev-clean", + num_lines=NUM_LINES_DEV_CLEAN) prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100") + manifest_path=args.manifest_prefix + ".train-clean-100", + num_lines=NUM_LINES_TRAIN_CLEAN_100) + if args.full_download: + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other", + num_lines=NUM_LINES_TEST_OTHER) + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other", + num_lines=NUM_LINES_DEV_OTHER) + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360", + num_lines=NUM_LINES_TRAIN_CLEAN_360) + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500", + num_lines=NUM_LINES_TRAIN_OTHER_500) if __name__ == '__main__': -- GitLab