diff --git a/decoder.py b/decoder.py index 8f2e0508de79fea30ebc30230e948b15923bdf24..61ead25c8d46f8a362b8d72d88dd80aac5824088 100644 --- a/decoder.py +++ b/decoder.py @@ -9,8 +9,9 @@ from math import log import multiprocessing -def ctc_best_path_decoder(probs_seq, vocabulary): - """Best path decoder, also called argmax decoder or greedy decoder. +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + Path consisting of the most probable tokens are further post-processed to remove consecutive repetitions and all blanks. @@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq, cutoff_prob=1.0, ext_scoring_func=None, nproc=False): - """Beam search decoder for CTC-trained network. It utilizes beam search - to approximately select top best decoding labels and returning results - in the descending order. The implementation is based on Prefix - Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is redesigned. 
Two important modifications: 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 2) the if condition "if l^+ not diff --git a/demo_server.py b/demo_server.py index b000e35e91c20ec925fe1cd52a3901ed7ee9519f..d2afa49b4d83cb9e0a6d7a90ad53661f292f9ff1 100644 --- a/demo_server.py +++ b/demo_server.py @@ -9,118 +9,74 @@ import SocketServer import struct import wave import paddle.v2 as paddle -from utils import print_arguments from data_utils.data import DataGenerator from model import DeepSpeech2Model from data_utils.utils import read_manifest parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--host_ip", - default="localhost", - type=str, - help="Server IP address. (default: %(default)s)") -parser.add_argument( - "--host_port", - default=8086, - type=int, - help="Server Port. (default: %(default)s)") -parser.add_argument( - "--speech_save_dir", - default="demo_cache", - type=str, - help="Directory for saving demo speech. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--warmup_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for warmup test. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. 
(default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=100, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. 
(default: %(default)f)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('host_port', int, 8086, "Server's IP port.") +add_arg('host_ip', str, + 'localhost', + "Server's IP address.") +add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. 
Not for GRU.") +# configurations of data io +add_arg('warmup_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to warm up.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: enable class AsrTCPServer(SocketServer.TCPServer): @@ -200,8 +156,8 @@ def start_server(): """Start the ASR server""" # prepare data generator data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1) @@ -212,7 +168,7 @@ def start_server(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) # prepare ASR inference handler @@ -220,13 +176,13 @@ def start_server(): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( infer_data=[feature], - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, + language_model_path=args.lang_model_path, num_processes=1) return result_transcript[0] @@ -235,7 +191,7 @@ def start_server(): print('Warming up ...') warm_up_test( audio_process_handler=file_to_transcript, - manifest_path=args.warmup_manifest_path, + manifest_path=args.warmup_manifest, num_test_cases=3) 
print('-----------------------------------------------------------') @@ -249,6 +205,13 @@ def start_server(): server.serve_forever() +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=1) diff --git a/evaluate.py b/evaluate.py index 8dd169b6c2a41a1ad749324e6cba60bff98d951b..1adf42557619c9499366ba093aa1eb49e488e894 100644 --- a/evaluate.py +++ b/evaluate.py @@ -10,140 +10,83 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", - default=128, - type=int, - help="Minibatch size for evaluation. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. 
(default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding, best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. 
(default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('batch_size', int, 128, "Minibatch size.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. 
Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('test_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to evaluate.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: enable def evaluate(): """Evaluate on whole test data for DeepSpeech2.""" data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.test_manifest, batch_size=args.batch_size, min_batch_size=1, sortagrad=False, @@ -155,7 +98,7 @@ def evaluate(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) error_rate_func = cer if args.error_rate_type == 'cer' else wer @@ -163,14 +106,14 @@ def evaluate(): for infer_data in 
batch_reader(): result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) target_transcripts = [ ''.join([data_generator.vocab_list[token] for token in transcript]) for _, transcript in infer_data @@ -184,8 +127,15 @@ def evaluate(): (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) evaluate() diff --git a/infer.py b/infer.py index 0c52ffc831b3349dacc5453bc21dc9a13e6471c8..cf02808c1beba99242446c4420da3d57ef2a797d 100644 --- a/infer.py +++ b/infer.py @@ -10,140 +10,82 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=10, - type=int, - help="Number of samples for inference. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. 
(default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. 
" - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('num_samples', int, 10, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 
'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('infer_manifest', str, + 'datasets/manifest.dev', + "Filepath of manifest to infer.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: enable def infer(): """Inference for DeepSpeech2.""" data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=1) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.infer_manifest, batch_size=args.num_samples, min_batch_size=1, sortagrad=False, @@ -156,18 +98,18 @@ def infer(): num_rnn_layers=args.num_rnn_layers, 
rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ @@ -181,8 +123,15 @@ def infer(): (args.error_rate_type, error_rate_func(target, result))) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) infer() diff --git a/model.py b/model.py index 0234ed2d4c901f36ebfc16b317f5355cd57796e0..894605bfd85d7bdc116332c03e9fa5645cc32726 100644 --- a/model.py +++ b/model.py @@ -146,7 +146,7 @@ class DeepSpeech2Model(object): # run inference return self._loss_inferer.infer(input=infer_data) - def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, + def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): """Model inference. Infer the transcription for a batch of speech @@ -156,9 +156,9 @@ class DeepSpeech2Model(object): consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list - :param decode_method: Decoding method name, 'best_path' or - 'beam search'. 
- :param decode_method: string + :param decoder_method: Decoding method name, 'ctc_greedy' or + 'ctc_beam_search'. + :type decoder_method: string :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. @@ -190,13 +190,13 @@ class DeepSpeech2Model(object): ] # run decoder results = [] - if decode_method == "best_path": + if decoder_method == "ctc_greedy": # best path decode for i, probs in enumerate(probs_split): - output_transcription = ctc_best_path_decoder( + output_transcription = ctc_greedy_decoder( probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) - elif decode_method == "beam_search": + elif decoder_method == "ctc_beam_search": # initialize external scorer if self._ext_scorer == None: self._ext_scorer = LmScorer(beam_alpha, beam_beta, @@ -205,7 +205,6 @@ class DeepSpeech2Model(object): else: self._ext_scorer.reset_params(beam_alpha, beam_beta) assert self._loaded_lm_path == language_model_path - # beam search decode beam_search_results = ctc_beam_search_decoder_batch( probs_split=probs_split, @@ -218,8 +217,8 @@ class DeepSpeech2Model(object): results = [result[0][1] for result in beam_search_results] else: - raise ValueError("Decoding method [%s] is not supported." % - decode_method) + raise ValueError("Decoder method [%s] is not supported." 
% + decoder_method) return results def _create_parameters(self, model_path=None): diff --git a/train.py b/train.py index d055341f10c82f3cec38867e2db36cfaaabe0a79..d21e6a3bdba6852f218aca23cd28f5bd3f0d7f58 100644 --- a/train.py +++ b/train.py @@ -9,169 +9,103 @@ import multiprocessing import paddle.v2 as paddle from model import DeepSpeech2Model from data_utils.data import DataGenerator -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", default=256, type=int, help="Minibatch size.") -parser.add_argument( - "--num_passes", - default=200, - type=int, - help="Training pass number. (default: %(default)s)") -parser.add_argument( - "--num_iterations_print", - default=100, - type=int, - help="Number of iterations for every train cost printing. " - "(default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--adam_learning_rate", - default=5e-4, - type=float, - help="Learning rate for ADAM Optimizer. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. 
(default: %(default)s)") -parser.add_argument( - "--use_sortagrad", - default=True, - type=distutils.util.strtobool, - help="Use sortagrad or not. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--max_duration", - default=27.0, - type=float, - help="Audios with duration larger than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--min_duration", - default=0.0, - type=float, - help="Audios with duration smaller than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--shuffle_method", - default='batch_shuffle_clipped', - type=str, - help="Shuffle method: 'instance_shuffle', 'batch_shuffle', " - "'batch_shuffle_batch'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--train_manifest_path", - default='datasets/manifest.train', - type=str, - help="Manifest path for training. (default: %(default)s)") -parser.add_argument( - "--dev_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for validation. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--init_model_path", - default=None, - type=str, - help="If set None, the training will start from scratch. 
" - "Otherwise, the training will resume from " - "the existing model of this path. (default: %(default)s)") -parser.add_argument( - "--output_model_dir", - default="./checkpoints", - type=str, - help="Directory for saving models. (default: %(default)s)") -parser.add_argument( - "--augmentation_config", - default=open('conf/augmentation.config', 'r').read(), - type=str, - help="Augmentation configuration in json-format. " - "(default: %(default)s)") -parser.add_argument( - "--is_local", - default=True, - type=distutils.util.strtobool, - help="Set to false if running with pserver in paddlecloud. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of optimization +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('learning_rate', float, 5e-4, "Learning rate.") +add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('num_passes', int, 200, "# of training epochs.") +add_arg('is_local', bool, True, "Use pserver or not.") +add_arg('num_iter_print', int, 100, "Every # iterations for printing " + "train cost.") +# configurations of data preprocess +add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") +add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") +add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. 
Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('augment_conf_path',str, + 'conf/augmentation.config', + "Filepath of augmentation configuration file (json-format).") +add_arg('shuffle_method', str, + 'batch_shuffle_clipped', + "Shuffle method.", + choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('train_manifest', str, + 'datasets/manifest.train', + "Filepath of train manifest.") +add_arg('dev_manifest', str, + 'datasets/manifest.dev', + "Filepath of validation manifest.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('init_model_path', str, + None, + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('output_model_dir', str, + "./checkpoints", + "Directory for saving checkpoints.") args = parser.parse_args() +# yapf: disable def train(): """DeepSpeech2 training.""" train_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config=open(args.augment_conf_path, 'r').read(), max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) dev_generator = 
DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) train_batch_reader = train_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, + manifest_path=args.train_manifest, batch_size=args.batch_size, min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else False, shuffle_method=args.shuffle_method) dev_batch_reader = dev_generator.batch_reader_creator( - manifest_path=args.dev_manifest_path, + manifest_path=args.dev_manifest, batch_size=args.batch_size, min_batch_size=1, # must be 1, but will have errors. sortagrad=False, @@ -184,21 +118,28 @@ def train(): rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, pretrained_model_path=args.init_model_path, - share_rnn_weights=args.share_rnn_weights) + share_rnn_weights=args.share_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, feeding_dict=train_generator.feeding, - learning_rate=args.adam_learning_rate, + learning_rate=args.learning_rate, gradient_clipping=400, num_passes=args.num_passes, - num_iterations_print=args.num_iterations_print, + num_iterations_print=args.num_iter_print, output_model_dir=args.output_model_dir, is_local=args.is_local) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) train() diff --git a/tune.py b/tune.py index d8001339eef1f51bb221238a647b2c4857a790d2..eac7ccd305f12e9b7a3404ebb29d538596298181 100644 --- a/tune.py +++ 
b/tune.py @@ -1,4 +1,4 @@ -"""Parameters tuning for DeepSpeech2 model.""" +"""Beam search parameters tuning for DeepSpeech2 model.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -11,134 +11,71 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=100, - type=int, - help="Number of samples for parameters tuning. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. 
(default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--tune_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for tuning. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha_from", - default=0.1, - type=float, - help="Where alpha starts from. (default: %(default)f)") -parser.add_argument( - "--num_alphas", - default=14, - type=int, - help="Number of candidate alphas. (default: %(default)d)") -parser.add_argument( - "--alpha_to", - default=0.36, - type=float, - help="Where alpha ends with. (default: %(default)f)") -parser.add_argument( - "--beta_from", - default=0.05, - type=float, - help="Where beta starts from. (default: %(default)f)") -parser.add_argument( - "--num_betas", - default=20, - type=float, - help="Number of candidate betas. (default: %(default)d)") -parser.add_argument( - "--beta_to", - default=1.0, - type=float, - help="Where beta ends with. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. 
(default: %(default)f)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('num_samples', int, 100, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of tuning parameters +add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") +add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") +add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.") +add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") +add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") +add_arg('num_betas', int, 20, "# of beta candidates for tuning.") +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. 
Not for GRU.") +# configurations of data io +add_arg('tune_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to tune.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: disable def tune(): @@ -149,13 +86,13 @@ def tune(): raise ValueError("num_betas must be non-negative!") data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=1) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.tune_manifest_path, + manifest_path=args.tune_manifest, batch_size=args.num_samples, sortagrad=False, shuffle_method=None) @@ -171,7 +108,7 @@ def tune(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) # create grid for search @@ -184,14 +121,14 @@ def tune(): for alpha, beta in params_grid: result_transcripts = ds2_model.infer_batch( infer_data=tune_data, - decode_method='beam_search', + decoder_method='ctc_beam_search', beam_alpha=alpha, beam_beta=beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) wer_sum, num_ins = 0.0, 0 for target, result in 
zip(target_transcripts, result_transcripts): wer_sum += wer(target, result) @@ -200,8 +137,15 @@ def tune(): (alpha, beta, wer_sum / num_ins)) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) tune() diff --git a/utils.py b/utils.py deleted file mode 100644 index 1d51e2042397b4d3010259a8a3174bc969968aec..0000000000000000000000000000000000000000 --- a/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Contains common utility functions.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -def print_arguments(args): - """Print argparse's arguments. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() - print_arguments(args) - - :param args: Input argparse.Namespace for printing. - :type args: argparse.Namespace - """ - print("----- Configuration Arguments -----") - for arg, value in vars(args).iteritems(): - print("%s: %s" % (arg, value)) - print("------------------------------------")