提交 b05d5ea5 编写于 作者: X Xinghai Sun

Remove manifest's line number check from librispeech.py and update README.md.

上级 d784c538
...@@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all ...@@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all
cd .. cd ..
``` ```
After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format.
By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three separate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way to merge different data sets.
More help for arguments: More help for arguments:
``` ```
......
""" """
Download, unpack and create manifest file for the LibriSpeech dataset. Download, unpack and create manifest json files for the LibriSpeech dataset.
A manifest file is a dataset summarization, with each line a json format A manifest is a json file summarizing filelist in a data set, with each line
string containing meta data for one audio clip, including its filepath, containing the meta data (i.e. audio filepath, transcription text, audio
transcription string, and duration. It serves as a unified interface for duration) of each audio file in the data set.
different data sets.
""" """
import paddle.v2 as paddle import paddle.v2 as paddle
...@@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" ...@@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
NUM_LINES_TEST_CLEAN = 2620
NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Downloads and prepare LibriSpeech dataset.') description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument( parser.add_argument(
...@@ -95,12 +86,9 @@ def unpack(filepath, target_dir): ...@@ -95,12 +86,9 @@ def unpack(filepath, target_dir):
def create_manifest(data_dir, manifest_path): def create_manifest(data_dir, manifest_path):
""" """
Create a manifest file summarizing the dataset (list of filepath and meta Create a manifest json file summarizing the data set, with each line
data). containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
Each line of the manifest contains one audio clip filepath, its
transcription text string, and its duration. Manifest file serves as a
unified interface to organize data sets.
""" """
print("Creating manifest %s ..." % manifest_path) print("Creating manifest %s ..." % manifest_path)
json_lines = [] json_lines = []
...@@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path): ...@@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path):
out_file.write(line + '\n') out_file.write(line + '\n')
def verify_file_line_number(filepath, num_lines): def prepare_dataset(url, md5sum, target_dir, manifest_path):
with open(filepath, 'r') as file:
return len(file.readlines()) == num_lines
def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
""" """
Download, unpack and create summary manifest file. Download, unpack and create summary manifest file.
""" """
# download
filepath = download(url, md5sum, target_dir)
# unpack
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
filepath = download(url, md5sum, target_dir)
# unpack
unpack(filepath, target_dir) unpack(filepath, target_dir)
else: else:
print("Unpacked data exists, skip unpacking.") print("Skip downloading and unpacking. Data already exists in %s." %
# create manifest and verify line number target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path) create_manifest(target_dir, manifest_path)
if not verify_file_line_number(manifest_path, num_lines):
raise RuntimeError("Manifest line number check failed. "
"Please remove directory and try running the script "
"again.")
def main(): def main():
...@@ -157,45 +137,38 @@ def main(): ...@@ -157,45 +137,38 @@ def main():
url=URL_TEST_CLEAN, url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN, md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"), target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean", manifest_path=args.manifest_prefix + ".test-clean")
num_lines=NUM_LINES_TEST_CLEAN)
prepare_dataset( prepare_dataset(
url=URL_DEV_CLEAN, url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN, md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"), target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean", manifest_path=args.manifest_prefix + ".dev-clean")
num_lines=NUM_LINES_DEV_CLEAN)
prepare_dataset( prepare_dataset(
url=URL_TRAIN_CLEAN_100, url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"), target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100", manifest_path=args.manifest_prefix + ".train-clean-100")
num_lines=NUM_LINES_TRAIN_CLEAN_100)
if args.full_download: if args.full_download:
prepare_dataset( prepare_dataset(
url=URL_TEST_OTHER, url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER, md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"), target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other", manifest_path=args.manifest_prefix + ".test-other")
num_lines=NUM_LINES_TEST_OTHER)
prepare_dataset( prepare_dataset(
url=URL_DEV_OTHER, url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER, md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"), target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other", manifest_path=args.manifest_prefix + ".dev-other")
num_lines=NUM_LINES_DEV_OTHER)
prepare_dataset( prepare_dataset(
url=URL_TRAIN_CLEAN_360, url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360, md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"), target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360", manifest_path=args.manifest_prefix + ".train-clean-360")
num_lines=NUM_LINES_TRAIN_CLEAN_360)
prepare_dataset( prepare_dataset(
url=URL_TRAIN_OTHER_500, url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500, md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"), target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500", manifest_path=args.manifest_prefix + ".train-other-500")
num_lines=NUM_LINES_TRAIN_OTHER_500)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册