diff --git a/data/librispeech/librispeech.py b/data/librispeech/librispeech.py index d963a7d5372d64f3abb1dcbdd16dbdafc1888de0..14a3804e2eb8e1a114ad9b4cbb56a5fc1061677e 100644 --- a/data/librispeech/librispeech.py +++ b/data/librispeech/librispeech.py @@ -41,7 +41,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--target_dir", - default=DATA_HOME + "/Libri", + default=DATA_HOME + "/libri", type=str, help="Directory to save the dataset. (default: %(default)s)") parser.add_argument( diff --git a/data/tiny/tiny.py b/data/tiny/tiny.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba2a13c577660cad22be0e594fa42bb894330d8 --- /dev/null +++ b/data/tiny/tiny.py @@ -0,0 +1,126 @@ +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils.util +import os +import sys +import tarfile +import argparse +import soundfile +import json +import codecs +from paddle.v2.dataset.common import md5file + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = "http://www.openslr.org/resources/12" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" +MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/tiny", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. 
def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.

    :param url: URL of the file to fetch; its last path component is used
                as the local file name.
    :type url: basestring
    :param md5sum: Expected MD5 hex digest of the file.
    :type md5sum: basestring
    :param target_dir: Directory to save the file into (created if missing).
    :type target_dir: basestring
    :return: Local path of the downloaded (or already present) file.
    :rtype: basestring
    :raises RuntimeError: If the file is missing or its MD5 digest does not
                          match after the download attempt.
    """
    # Imported locally so this function does not depend on a change to the
    # module-level import block.
    import subprocess

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        # Pass an argument list (no shell) so that directories containing
        # spaces or shell metacharacters reach wget verbatim.
        subprocess.call(["wget", "-c", url, "-P", target_dir])
        print("\nMD5 Checksum %s ..." % filepath)
        # A failed download may leave no file at all; report that as a
        # checksum failure instead of an I/O error from md5file().
        if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """
    Unpack the tar archive at filepath into target_dir.

    :param filepath: Path of the (optionally compressed) tar archive.
    :type filepath: basestring
    :param target_dir: Directory the archive members are extracted into.
    :type target_dir: basestring
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)


def create_manifest(data_dir, manifest_path):
    """
    Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.

    Every directory under data_dir that holds a file ending in 'trans.txt'
    is read; each non-empty line of that file is expected to be
    "<utterance-id> <transcript ...>", with the audio stored next to it as
    "<utterance-id>.flac".

    :param data_dir: Root directory to scan recursively.
    :type data_dir: basestring
    :param manifest_path: Output path of the manifest file.
    :type manifest_path: basestring
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not text_filelist:
            continue
        # os.walk() already yields paths rooted at data_dir; joining with
        # data_dir again would duplicate the prefix for relative paths.
        text_filepath = os.path.join(subfolder, text_filelist[0])
        with codecs.open(text_filepath, 'r', 'utf-8') as text_file:
            for line in text_file:
                segments = line.strip().split()
                if not segments:
                    # Skip blank lines instead of failing on segments[0].
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download, unpack and create summary manifest file.

    Downloading and unpacking are skipped when target_dir already contains
    a "LibriSpeech" entry; the manifest is (re)generated in either case.

    :param url: URL of the dataset archive.
    :type url: basestring
    :param md5sum: Expected MD5 hex digest of the archive.
    :type md5sum: basestring
    :param target_dir: Directory holding the archive and the unpacked data.
    :type target_dir: basestring
    :param manifest_path: Output path of the manifest file.
    :type manifest_path: basestring
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
        filepath = download(url, md5sum, target_dir)
        # unpack
        unpack(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    """Prepare the dev-clean subset using the parsed command-line args."""
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")


if __name__ == '__main__':
    main()
#! /usr/bin/bash

# Prepare the "tiny" example dataset: download LibriSpeech dev-clean, build
# its manifest, carve out small train/dev/test manifests, then build the
# vocabulary and the feature normalizer statistics from the train split.

# All paths below are relative to the repository root.
pushd ../..

if [ $? -ne 0 ]; then
    echo "Failed to change to the repository root. Terminated."
    exit 1
fi

# download data, generate manifests
# ($HOME is quoted so that a home directory containing spaces is passed as a
# single argument)
python data/tiny/tiny.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir="${HOME}/.cache/paddle/dataset/speech/tiny"

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# manifest lines 1-32 -> train, 33-48 -> dev, 49-64 -> test
head -n 32 data/tiny/manifest.dev-clean > data/tiny/manifest.train
head -n 48 data/tiny/manifest.dev-clean | tail -n 16 > data/tiny/manifest.dev
head -n 64 data/tiny/manifest.dev-clean | tail -n 16 > data/tiny/manifest.test


# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.train'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.train' \
--num_samples=32 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "Tiny data preparation done."
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1 \ python -u train.py \ ---batch_size=256 \ ---trainer_count=8 \ ---num_passes=50 \ ---num_proc_data=12 \ +--batch_size=2 \ +--trainer_count=1 \ +--num_passes=10 \ +--num_proc_data=1 \ --num_conv_layers=2 \ --num_rnn_layers=3 \ --rnn_layer_size=2048 \ --num_iter_print=100 \ ---learning_rate=5e-4 \ +--learning_rate=5e-5 \ --max_duration=27.0 \ --min_duration=0.0 \ --use_sortagrad=True \ @@ -20,10 +20,10 @@ python -u train.py \ --use_gpu=True \ --is_local=True \ --share_rnn_weights=True \ ---train_manifest='data/librispeech/manifest.train' \ ---dev_manifest='data/librispeech/manifest.dev' \ ---mean_std_path='data/librispeech/mean_std.npz' \ ---vocab_path='data/librispeech/eng_vocab.txt' \ +--train_manifest='data/tiny/manifest.train' \ +--dev_manifest='data/tiny/manifest.train' \ +--mean_std_path='data/tiny/mean_std.npz' \ +--vocab_path='data/tiny/vocab.txt' \ --output_model_dir='./checkpoints' \ --augment_conf_path='conf/augmentation.config' \ --specgram_type='linear' \ diff --git a/examples/librispeech_tiny/run_tune.sh b/examples/tiny/run_tune.sh similarity index 100% rename from examples/librispeech_tiny/run_tune.sh rename to examples/tiny/run_tune.sh diff --git a/infer.py b/infer.py index 1ce969ae07b649a4b2d2669683b3ae537bb8edc2..73e200b496585c86c475b8820c0300a02db12861 100644 --- a/infer.py +++ b/infer.py @@ -7,7 +7,7 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from models.model import DeepSpeech2Model +from model_utils.model import DeepSpeech2Model from utils.error_rate import wer, cer from utils.utility import add_arguments, print_arguments @@ -35,10 +35,10 @@ add_arg('mean_std_path', str, 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'data/librispeech/eng_vocab.txt', + 'data/librispeech/vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, - 
'lm/data/common_crawl_00.prune01111.trie.klm', + 'model_zoo/lm/common_crawl_00.prune01111.trie.klm', "Filepath for language model.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/lm/__init__.py b/model_utils/__init__.py similarity index 100% rename from lm/__init__.py rename to model_utils/__init__.py diff --git a/models/decoder.py b/model_utils/decoder.py similarity index 99% rename from models/decoder.py rename to model_utils/decoder.py index 61ead25c8d46f8a362b8d72d88dd80aac5824088..ffba2731a06b49105f74ab2c47831105c4c68428 100644 --- a/models/decoder.py +++ b/model_utils/decoder.py @@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq, prob = prob * ext_scoring_func(result) log_prob = log(prob) beam_result.append((log_prob, result)) + else: + beam_result.append((float('-inf'), '')) ## output top beam_size decoding results beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True) diff --git a/lm/lm_scorer.py b/model_utils/lm_scorer.py similarity index 100% rename from lm/lm_scorer.py rename to model_utils/lm_scorer.py diff --git a/models/model.py b/model_utils/model.py similarity index 97% rename from models/model.py rename to model_utils/model.py index 93c4c41bf761a519f7f6e70bd5221cec9773f3f9..cf146f8ce988c528b8b61127562327e11aadff6b 100644 --- a/models/model.py +++ b/model_utils/model.py @@ -8,9 +8,10 @@ import os import time import gzip import paddle.v2 as paddle -from lm.lm_scorer import LmScorer -from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder -from models.network import deep_speech_v2_network +from model_utils.lm_scorer import LmScorer +from model_utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder +from model_utils.decoder import ctc_beam_search_decoder_batch +from model_utils.network import deep_speech_v2_network class DeepSpeech2Model(object): diff --git a/models/network.py b/model_utils/network.py similarity index 100% rename from models/network.py rename to 
model_utils/network.py diff --git a/models/tests/test_decoders.py b/model_utils/tests/test_decoders.py similarity index 99% rename from models/tests/test_decoders.py rename to model_utils/tests/test_decoders.py index acce46af81c0168903fa57d5d756dcfd911aa15f..adf36eefc3810006b7a571f3c11771201b1a1dce 100644 --- a/models/tests/test_decoders.py +++ b/model_utils/tests/test_decoders.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import unittest -from models import decoder +from model_utils import decoder class TestDecoders(unittest.TestCase): diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/lm/run.sh b/models/lm/download_en.sh similarity index 99% rename from lm/run.sh rename to models/lm/download_en.sh index 2108ea55f1205f4c4c32b8994602544ca4e63edd..5ca33c679062483f932f371578991a66aa531054 100644 --- a/lm/run.sh +++ b/models/lm/download_en.sh @@ -14,6 +14,3 @@ if [ $MD5 != $md5_tmp ]; then echo "Fail to download the language model!" 
exit 1 fi - - - diff --git a/test.py b/test.py index 747e40df872cd3f9e0844ed7ad82b2f9cfecf196..791bfd58503894f8f83227409bc427f9bd065148 100644 --- a/test.py +++ b/test.py @@ -7,7 +7,7 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from models.model import DeepSpeech2Model +from model_utils.model import DeepSpeech2Model from utils.error_rate import wer, cer from utils.utility import add_arguments, print_arguments @@ -36,14 +36,14 @@ add_arg('mean_std_path', str, 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'data/librispeech/eng_vocab.txt', + 'data/librispeech/vocab.txt', "Filepath of vocabulary.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', + 'model_zoo/lm/common_crawl_00.prune01111.trie.klm', "Filepath for language model.") add_arg('decoding_method', str, 'ctc_beam_search', diff --git a/tools/build_vocab.py b/tools/build_vocab.py index ef9bde49f9d29470375ab5f471386c8456a85b72..e167e92adf36a75972f9695e76b29cc2adfb4f77 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -21,10 +21,8 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") add_arg('vocab_path', str, - None, - "Filepath to write the vocabulary.", - nargs='+', - required=True) + 'data/librispeech/vocab.txt', + "Filepath to write the vocabulary.") add_arg('manifest_paths', str, None, "Filepaths of manifests for building vocabulary. 
" diff --git a/tools/tune.py b/tools/tune.py index 7a23791092cd73b5bc7ca10997f586040cfc33a0..25e495f19e391db7977e31c01b694cfd957d9d04 100644 --- a/tools/tune.py +++ b/tools/tune.py @@ -9,7 +9,7 @@ import functools import paddle.v2 as paddle import _init_paths from data_utils.data import DataGenerator -from models.model import DeepSpeech2Model +from model_utils.model import DeepSpeech2Model from utils.error_rate import wer from utils.utility import add_arguments, print_arguments @@ -41,10 +41,10 @@ add_arg('mean_std_path', str, 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'data/librispeech/eng_vocab.txt', + 'data/librispeech/vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', + 'model_zoo/lm/common_crawl_00.prune01111.trie.klm', "Filepath for language model.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/train.py b/train.py index 4a7a0eda2f49eb08e55a323830c9f4bc27de0ea2..bbf1cd7290b56a5ca1e045f1878e42560f5289ea 100644 --- a/train.py +++ b/train.py @@ -6,7 +6,7 @@ from __future__ import print_function import argparse import functools import paddle.v2 as paddle -from models.model import DeepSpeech2Model +from model_utils.model import DeepSpeech2Model from data_utils.data import DataGenerator from utils.utility import add_arguments, print_arguments @@ -41,7 +41,7 @@ add_arg('mean_std_path', str, 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'data/librispeech/eng_vocab.txt', + 'data/librispeech/vocab.txt', "Filepath of vocabulary.") add_arg('init_model_path', str, None,