diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index 0cdb203d21ef5fa854a011f2f0381078cabcb874..90b339502851b049f299e8b4dff8f62dfeb60592 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -16,12 +16,16 @@ For some machines, we also need to install libsndfile1. Details to be added.
 ### Preparing Data
 
 ```
-cd datasets
-sh run_all.sh
-cd ..
+sh datasets/run_all.sh
 ```
 
-`sh run_all.sh` prepares all ASR datasets (currently, only LibriSpeech available). After running, we have several summarization manifest files in json-format.
+`sh datasets/run_all.sh` prepares all ASR datasets (currently, only LibriSpeech and THCHS30 are available). After running, we will have several summary manifest files in json format.
 
 A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcript text, audio duration) of each audio file within the data set, in json format. Manifest file serves as an interface informing our system of where and what to read the speech samples.
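+
+For instance, one line in a manifest file may look like this (a made-up sample; the actual `audio_filepath` depends on where the datasets are unpacked):
+
+```
+{"audio_filepath": "/path/to/LibriSpeech/train-clean-100/sample-0001.flac", "duration": 3.52, "text": "an example transcript"}
+```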
(default: %(default)s)") parser.add_argument( @@ -58,33 +56,6 @@ parser.add_argument( args = parser.parse_args() -def download(url, md5sum, target_dir): - """ - Download file from url to target_dir, and check md5sum. - """ - if not os.path.exists(target_dir): os.makedirs(target_dir) - filepath = os.path.join(target_dir, url.split("/")[-1]) - if not (os.path.exists(filepath) and md5file(filepath) == md5sum): - print("Downloading %s ..." % url) - wget.download(url, target_dir) - print("\nMD5 Chesksum %s ..." % filepath) - if not md5file(filepath) == md5sum: - raise RuntimeError("MD5 checksum failed.") - else: - print("File exists, skip downloading. (%s)" % filepath) - return filepath - - -def unpack(filepath, target_dir): - """ - Unpack the file to the target_dir. - """ - print("Unpacking %s ..." % filepath) - tar = tarfile.open(filepath) - tar.extractall(target_dir) - tar.close() - - def create_manifest(data_dir, manifest_path): """ Create a manifest json file summarizing the data set, with each line diff --git a/deep_speech_2/datasets/run_all.sh b/deep_speech_2/datasets/run_all.sh index ef2b721fbdc2a18fcbc208730189604e88d7ef2c..d6dc418e4b5549bd2efa6aeea502cc9adb80ac5a 100644 --- a/deep_speech_2/datasets/run_all.sh +++ b/deep_speech_2/datasets/run_all.sh @@ -1,13 +1,22 @@ -cd librispeech -python librispeech.py +export PYTHONPATH=`pwd`:$PYTHONPATH +cd datasets + +python thchs30/thchs30.py +if [ $? -ne 0 ]; then + echo "Prepare LHCHS30 failed. Terminated." + exit 1 +fi + +python librispeech/librispeech.py if [ $? -ne 0 ]; then echo "Prepare LibriSpeech failed. Terminated." exit 1 fi -cd - cat librispeech/manifest.train* | shuf > manifest.train cat librispeech/manifest.dev-clean > manifest.dev cat librispeech/manifest.test-clean > manifest.test echo "All done." + +cd - diff --git a/deep_speech_2/datasets/thchs30/thchs30.py b/deep_speech_2/datasets/thchs30/thchs30.py new file mode 100644 index 0000000000000000000000000000000000000000..8538862aac303dc3e73c3bb74d502d2c7e4b07cc --- /dev/null +++ b/deep_speech_2/datasets/thchs30/thchs30.py @@ -0,0 +1,133 @@ +"""Prepare THCHS-30 Chinese Speech Corpus. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils.util +import os +import argparse +import soundfile +import json +from datasets.common import download, unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = "http://www.openslr.org/resources/18" +URL_CLEAN_DATA = URL_ROOT + "/data_thchs30.tgz" +URL_0DB_NOISY_TEST_DATA = URL_ROOT + "/test-noise.tgz" + +MD5_CLEAN_DATA = "2d2252bde5c8429929e1841d4cb95e90" +MD5_0DB_NOISY_TEST_DATA = "7e8a985fb965b84141b68c68556c2030" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/THCHS30", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest-thchs30", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--download_0db_noise_test", + default="True", + type=distutils.util.strtobool, + help="Whether to download 0db noisy test dataset." + " If True, download 0Db noise mixed test data. 
(default: %(default)s)") +parser.add_argument( + "--remove_tar", + default="True", + type=distutils.util.strtobool, + help="If True, remove tar file after unpacking automatically." + " (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(transcript_data_dir, audio_data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + filelist = sorted(os.walk(audio_data_dir))[0][2] + audio_filelist = [fname for fname in filelist if fname.endswith('.wav')] + for audio_file in audio_filelist: + transcript_file_path = os.path.join(transcript_data_dir, + audio_file + '.trn') + if not os.path.isfile(transcript_file_path): + raise IOError("Transcript file %s not exists." % \ + transcript_file_path) + transcript_text = open(transcript_file_path).readline().strip() + transcript_text = ''.join(transcript_text.split(' ')) + audio_file_path = os.path.join(audio_data_dir, audio_file) + audio_data, samplerate = soundfile.read(audio_file_path) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps( + { + 'audio_filepath': audio_file_path, + 'duration': duration, + 'text': transcript_text + }, + ensure_ascii=False)) + with open(manifest_path, 'w') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(target_dir, manifest_prefix, download_noisy, rm_tar): + def download_unpack(url, md5sum, download_dir, unpack_dir, rm_tar): + if not os.path.exists(unpack_dir): + filepath = download(url, md5sum, download_dir) + unpack(filepath, unpack_dir, rm_tar) + else: + print("Skip downloading and unpacking. Data already exists in %s" % + unpack_dir) + + clean_dir = os.path.join(target_dir, "Clean") + download_unpack(URL_CLEAN_DATA, MD5_CLEAN_DATA, target_dir, clean_dir, + rm_tar) + # create [train-clean|dev-clean|test-clean] manifest file + base_dir = os.path.join(clean_dir, 'data_thchs30') + transcript_data_dir = os.path.join(base_dir, 'data') + for data_type in ['train', 'dev', 'test']: + manifest_path = manifest_prefix + '.' + data_type + '-clean' + audio_data_dir = os.path.join(base_dir, data_type) + create_manifest(transcript_data_dir, audio_data_dir, manifest_path) + + if download_noisy == True: + # create test-0db-noise-[cafe|car|white] manifest file + noisy_test_dir = os.path.join(target_dir, "0DB-Noisy-Test") + download_unpack(URL_0DB_NOISY_TEST_DATA, MD5_0DB_NOISY_TEST_DATA, + target_dir, noisy_test_dir, rm_tar) + noisy_base_dir = os.path.join(noisy_test_dir, 'test-noise', '0db') + for data_type in ['cafe', 'car', 'white']: + manifest_path = manifest_prefix + '.' + \ + 'test-0db-noise-' + data_type + audio_data_dir = os.path.join(noisy_base_dir, data_type) + create_manifest(transcript_data_dir, audio_data_dir, manifest_path) + + +def main(): + target_dir = args.target_dir + manifest_prefix = args.manifest_prefix + download_noisy = False + if args.download_0db_noise_test == True: + download_noisy = True + rm_tar = False + if args.remove_tar == True: + rm_tar = True + prepare_dataset(target_dir, manifest_prefix, download_noisy, rm_tar) + + +if __name__ == '__main__': + main()