diff --git a/README.md b/README.md old mode 100755 new mode 100644 diff --git a/cloud/README.md b/cloud/README.md old mode 100755 new mode 100644 diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh index 3c9a1c2603cc7232640e5cd48f11e113b999a0fb..378a7c6e624624af2d3fd004ff41154204a21334 100644 --- a/cloud/pcloud_submit.sh +++ b/cloud/pcloud_submit.sh @@ -1,7 +1,9 @@ -TRAIN_MANIFEST="cloud/cloud.manifest.train" -DEV_MANIFEST="cloud/cloud.manifest.dev" +#! /usr/bin/bash + +TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train" +DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev" CLOUD_MODEL_DIR="./checkpoints" -BATCH_SIZE=256 +BATCH_SIZE=512 NUM_GPU=8 NUM_NODE=1 IS_LOCAL="True" diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh index 75949574d89a4210db5a4dd89dd71f2083a05ae4..d04132f900ef95539acb6cb7e77205d5108d7600 100644 --- a/cloud/pcloud_train.sh +++ b/cloud/pcloud_train.sh @@ -1,3 +1,5 @@ +#! /usr/bin/bash + TRAIN_MANIFEST=$1 DEV_MANIFEST=$2 MODEL_PATH=$3 @@ -14,11 +16,28 @@ python ./cloud/split_data.py \ --out_manifest_path='/local.manifest.dev' python -u train.py \ ---batch_size=$BATCH_SIZE \ ---use_gpu=1 \ +--batch_size=${BATCH_SIZE} \ --trainer_count=${NUM_GPU} \ ---num_threads_data=${NUM_GPU} \ +--num_passes=200 \ +--num_proc_data=${NUM_GPU} \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ --is_local=${IS_LOCAL} \ ---train_manifest_path='/local.manifest.train' \ ---dev_manifest_path='/local.manifest.dev' \ ---output_model_dir=${MODEL_PATH} 2>&1 | tee ./log/train.log +--share_rnn_weights=True \ +--train_manifest='/local.manifest.train' \ +--dev_manifest='/local.manifest.dev' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir=${MODEL_PATH} \ 
+--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' \ +2>&1 | tee ./log/train.log diff --git a/cloud/pcloud_upload_data.sh b/cloud/pcloud_upload_data.sh index 97a0ab1818607745ba0e2e1192abb67060bf13c3..4ef235ef7da57e5e1f611ddad8b7000528ab46cc 100644 --- a/cloud/pcloud_upload_data.sh +++ b/cloud/pcloud_upload_data.sh @@ -1,5 +1,9 @@ -IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test" -OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test" +#! /usr/bin/bash + +mkdir -p cloud_manifests + +IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean" +OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test" CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech" NUM_SHARDS=50 @@ -14,4 +18,5 @@ then echo "Upload Data Failed!" exit 1 fi + echo "All Done." 
diff --git a/datasets/vocab/eng_vocab.txt b/data/librispeech/eng_vocab.txt similarity index 100% rename from datasets/vocab/eng_vocab.txt rename to data/librispeech/eng_vocab.txt diff --git a/datasets/librispeech/librispeech.py b/data/librispeech/librispeech.py similarity index 100% rename from datasets/librispeech/librispeech.py rename to data/librispeech/librispeech.py diff --git a/datasets/noise/chime3_background.py b/data/noise/chime3_background.py similarity index 100% rename from datasets/noise/chime3_background.py rename to data/noise/chime3_background.py diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py index c3de0fdbb2a40150f8cffdef3487ceb4400e52ed..536b4d6a4a6666359b90e191a3d593250b44e863 100644 --- a/data_utils/augmentor/impulse_response.py +++ b/data_utils/augmentor/impulse_response.py @@ -4,23 +4,22 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class ImpulseResponseAugmentor(AugmentorBase): """Augmentation model for adding impulse response effect. - + :param rng: Random generator object. :type rng: random.Random :param impulse_manifest_path: Manifest path for impulse audio data. - :type impulse_manifest_path: basestring + :type impulse_manifest_path: basestring """ def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._impulse_manifest = utils.read_manifest( - manifest_path=impulse_manifest_path) + self._impulse_manifest = read_manifest(impulse_manifest_path) def transform_audio(self, audio_segment): """Add impulse response effect. 
diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py index 281174af42c2f6d673ead94bd532941769c79c25..96e0ff4deac48063faf76338014e418e3d8ad4ad 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/data_utils/augmentor/noise_perturb.py @@ -4,13 +4,13 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): """Augmentation model for adding background noise. - + :param rng: Random generator object. :type rng: random.Random :param min_snr_dB: Minimal signal noise ratio, in decibels. @@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase): :param max_snr_dB: Maximal signal noise ratio, in decibels. :type max_snr_dB: float :param noise_manifest_path: Manifest path for noise audio data. - :type noise_manifest_path: basestring + :type noise_manifest_path: basestring """ def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._noise_manifest = utils.read_manifest( - manifest_path=noise_manifest_path) + self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) def transform_audio(self, audio_segment): """Add background noise audio. 
diff --git a/data_utils/data.py b/data_utils/data.py index 33fcadc7bb756cc41aaf62a4d47b5f19ebdc7923..8bff6826dc51d6caaa420bec5a886e1878f36df4 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -11,7 +11,7 @@ import multiprocessing import numpy as np import paddle.v2 as paddle from threading import local -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.speech_featurizer import SpeechFeaturizer from data_utils.speech import SpeechSegment @@ -159,7 +159,7 @@ class DataGenerator(object): def batch_reader(): # read manifest - manifest = utils.read_manifest( + manifest = read_manifest( manifest_path=manifest_path, max_duration=self._max_duration, min_duration=self._min_duration) diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 39f453017e9a62d2740ee5e2d70cf3facfb7e040..12f8784a9921e9bd78735db3edda3898c54ee908 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import numpy as np -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment from python_speech_features import mfcc from python_speech_features import delta diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py index 1f4aae9a0913f323480c46c2d449f9515a65bb7e..7c2e05c9d85fa55c0a91386ebf9ba570b2ec0e3b 100644 --- a/data_utils/normalizer.py +++ b/data_utils/normalizer.py @@ -5,7 +5,7 @@ from __future__ import print_function import numpy as np import random -import data_utils.utils as utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment @@ -75,7 +75,7 @@ class FeatureNormalizer(object): def _compute_mean_std(self, manifest_path, featurize_func, num_samples): """Compute mean and std from 
randomly sampled instances.""" - manifest = utils.read_manifest(manifest_path) + manifest = read_manifest(manifest_path) sampled_manifest = self._rng.sample(manifest, num_samples) features = [] for instance in sampled_manifest: diff --git a/data_utils/utils.py b/data_utils/utility.py similarity index 100% rename from data_utils/utils.py rename to data_utils/utility.py diff --git a/datasets/run_all.sh b/datasets/run_all.sh deleted file mode 100644 index ef2b721fbdc2a18fcbc208730189604e88d7ef2c..0000000000000000000000000000000000000000 --- a/datasets/run_all.sh +++ /dev/null @@ -1,13 +0,0 @@ -cd librispeech -python librispeech.py -if [ $? -ne 0 ]; then - echo "Prepare LibriSpeech failed. Terminated." - exit 1 -fi -cd - - -cat librispeech/manifest.train* | shuf > manifest.train -cat librispeech/manifest.dev-clean > manifest.dev -cat librispeech/manifest.test-clean > manifest.test - -echo "All done." diff --git a/datasets/run_noise.sh b/datasets/run_noise.sh deleted file mode 100644 index 7b27abde47a97b671609f0cd15e81565b3a00d02..0000000000000000000000000000000000000000 --- a/datasets/run_noise.sh +++ /dev/null @@ -1,10 +0,0 @@ -cd noise -python chime3_background.py -if [ $? -ne 0 ]; then - echo "Prepare CHiME3 background noise failed. Terminated." - exit 1 -fi -cd - - -cat noise/manifest.* > manifest.noise -echo "All done." 
diff --git a/deploy/_init_paths.py b/deploy/_init_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..ddabb535be682d95c3c8b73003ea30eed06ca0b0 --- /dev/null +++ b/deploy/_init_paths.py @@ -0,0 +1,19 @@ +"""Set up paths for DS2""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = os.path.dirname(__file__) + +# Add project path to PYTHONPATH +proj_path = os.path.join(this_dir, '..') +add_path(proj_path) diff --git a/demo_client.py b/deploy/demo_client.py similarity index 100% rename from demo_client.py rename to deploy/demo_client.py diff --git a/demo_server.py b/deploy/demo_server.py similarity index 96% rename from demo_server.py rename to deploy/demo_server.py index 7cbee1fd44f517cc4d6e0602eda01163737dd93f..658b14197bee037429032bf87de70ee78a3edcab 100644 --- a/demo_server.py +++ b/deploy/demo_server.py @@ -9,10 +9,11 @@ import SocketServer import struct import wave import paddle.v2 as paddle +import _init_paths from data_utils.data import DataGenerator -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model -from data_utils.utils import read_manifest +from data_utils.utility import read_manifest -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -36,13 +37,13 @@ add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") add_arg('warmup_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.test-clean', "Filepath of manifest to warm up.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of 
vocabulary.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/evaluate.py b/evaluate.py index 1cc307dad3e611fe73cd7786976bfaca6a7c8227..747e40df872cd3f9e0844ed7ad82b2f9cfecf196 100644 --- a/evaluate.py +++ b/evaluate.py @@ -7,9 +7,9 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -30,13 +30,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('test_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.test-clean', "Filepath of manifest to evaluate.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/examples/librispeech/generate.sh b/examples/librispeech/generate.sh new file mode 100644 index 0000000000000000000000000000000000000000..a34b7bc1009e1ce32ca676898d72064d9022f2ab --- /dev/null +++ b/examples/librispeech/generate.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. 
+ +CUDA_VISIBLE_DEVICES=0 \ +python -u infer.py \ +--num_samples=10 \ +--trainer_count=1 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--infer_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/prepare_data.sh b/examples/librispeech/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..162a38c49dbda33d7db20586237931a42a9bd07d --- /dev/null +++ b/examples/librispeech/prepare_data.sh @@ -0,0 +1,32 @@ +#! /usr/bin/bash + +pushd ../.. + +# download data, generate manifests +python data/librispeech/librispeech.py \ +--manifest_prefix='data/librispeech/manifest' \ +--full_download='True' \ +--target_dir='~/.cache/paddle/dataset/speech/Libri' + +if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 +fi + +#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train + + +# compute mean and stddev for normalizer +python tools/compute_mean_std.py \ +--manifest_path='data/librispeech/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/librispeech/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "LibriSpeech Data preparation done." 
diff --git a/examples/librispeech/run_test.sh b/examples/librispeech/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a14cb682135bac8ce6097acfd07b5f2f615c1ba --- /dev/null +++ b/examples/librispeech/run_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u evaluate.py \ +--batch_size=128 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--test_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/run_train.sh b/examples/librispeech/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..832838a813e3995d9142dc8ebbbab185024e5c11 --- /dev/null +++ b/examples/librispeech/run_train.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. 
+ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u train.py \ +--batch_size=256 \ +--trainer_count=8 \ +--num_passes=200 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ +--is_local=True \ +--share_rnn_weights=True \ +--train_manifest='data/librispeech/manifest.train' \ +--dev_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir='./checkpoints' \ +--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' diff --git a/examples/librispeech/run_tune.sh b/examples/librispeech/run_tune.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d992e8842a3c9d434a32669d72df722a211c61b --- /dev/null +++ b/examples/librispeech/run_tune.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. 
+ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u tools/tune.py \ +--num_samples=100 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_alphas=14 \ +--num_betas=20 \ +--alpha_from=0.1 \ +--alpha_to=0.36 \ +--beta_from=0.05 \ +--beta_to=1.0 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--tune_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/infer.py b/infer.py index 3fd835b467f0d838efa05410be898c0a75aac24d..1ce969ae07b649a4b2d2669683b3ae537bb8edc2 100644 --- a/infer.py +++ b/infer.py @@ -7,9 +7,9 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -29,13 +29,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. 
Not for GRU.") add_arg('infer_manifest', str, - 'datasets/manifest.dev', + 'data/librispeech/manifest.dev-clean', "Filepath of manifest to infer.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model.py b/models/model.py similarity index 98% rename from model.py rename to models/model.py index 06f69290682226dffc601711d81f45242e23538d..3e6fc328abd747bd8f8ce164b74dbf5745fa949a 100644 --- a/model.py +++ b/models/model.py @@ -7,10 +7,10 @@ import sys import os import time import gzip -from decoder import * -from lm.lm_scorer import LmScorer import paddle.v2 as paddle -from layer import * +from utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder +from lm.lm_scorer import LmScorer +from models.network import deep_speech_v2_network class DeepSpeech2Model(object): @@ -241,7 +241,7 @@ class DeepSpeech2Model(object): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(vocab_size)) - self._log_probs, self._loss = deep_speech2( + self._log_probs, self._loss = deep_speech_v2_network( audio_data=audio_data, text_data=text_data, dict_size=vocab_size, diff --git a/layer.py b/models/network.py similarity index 95% rename from layer.py rename to models/network.py index b7ac3c23e3c7bb91c9b2e616e7c42b7f87ca244f..13ba5d2c927116150ec15f5604b9576ee90d4200 100644 --- a/layer.py +++ b/models/network.py @@ -1,4 +1,4 @@ -"""Contains DeepSpeech2 layers.""" +"""Contains DeepSpeech2 layers and networks.""" from __future__ import absolute_import from __future__ import division from 
__future__ import print_function @@ -205,16 +205,15 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights): return output -def deep_speech2(audio_data, - text_data, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256, - use_gru=False, - share_rnn_weights=True): - """ - The whole DeepSpeech2 model structure. +def deep_speech_v2_network(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256, + use_gru=False, + share_rnn_weights=True): + """The DeepSpeech2 network structure. :param audio_data: Audio spectrogram data layer. :type audio_data: LayerOutput diff --git a/tools/build_vocab.py b/tools/build_vocab.py index ac600302679320f2fcfbee7645ad83c2442b47d5..6fbb9bdfc2b9fe10cc7fdc2e642172f82bacd824 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -13,8 +13,8 @@ import json from collections import Counter import os.path import _init_paths -from data_utils import utils -from utils import add_arguments, print_arguments +from data_utils.utility import read_manifest +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tools/compute_mean_std.py b/tools/compute_mean_std.py index 9f7bf06cedf532458d1d704f4099a4f23e931be5..5bb6be39dbc8c9b5d9b14a54b619fb4b86512bd6 100644 --- a/tools/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -9,7 +9,7 @@ import _init_paths from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tune.py b/tools/tune.py similarity index 93% rename from tune.py rename to 
tools/tune.py index eab00cfdb3ff54725767373df6a84ff4e4bc505e..7a23791092cd73b5bc7ca10997f586040cfc33a0 100644 --- a/tune.py +++ b/tools/tune.py @@ -7,10 +7,11 @@ import numpy as np import argparse import functools import paddle.v2 as paddle +import _init_paths from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -27,20 +28,20 @@ add_arg('num_betas', int, 20, "# of beta candidates for tuning.") add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") -add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") +add_arg('beta_to', float, 1.0, "Where beta ends tuning with.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. 
Not for GRU.") add_arg('tune_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.dev-clean', "Filepath of manifest to tune.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', diff --git a/train.py b/train.py index 7cef7539b35b805030976303ea901e6d8081386e..4a7a0eda2f49eb08e55a323830c9f4bc27de0ea2 100644 --- a/train.py +++ b/train.py @@ -6,9 +6,9 @@ from __future__ import print_function import argparse import functools import paddle.v2 as paddle -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model from data_utils.data import DataGenerator -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -27,21 +27,21 @@ add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('is_local', bool, True, "Use pserver or not.") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('is_local', bool, True, "Use pserver or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. 
Not for GRU.") add_arg('train_manifest', str, - 'datasets/manifest.train', + 'data/librispeech/manifest.train', "Filepath of train manifest.") add_arg('dev_manifest', str, - 'datasets/manifest.dev', + 'data/librispeech/manifest.dev-clean', "Filepath of validation manifest.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('init_model_path', str, None, @@ -101,7 +101,7 @@ def train(): rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, pretrained_model_path=args.init_model_path, - share_rnn_weights=args.share_weights) + share_rnn_weights=args.share_rnn_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/decoder.py b/utils/decoder.py similarity index 100% rename from decoder.py rename to utils/decoder.py diff --git a/error_rate.py b/utils/error_rate.py similarity index 100% rename from error_rate.py rename to utils/error_rate.py diff --git a/utils.py b/utils/utility.py similarity index 100% rename from utils.py rename to utils/utility.py