diff --git a/deep_speech_2/demo_server.py b/deep_speech_2/demo_server.py
index 6b73971a6b9dda5a2117cc42777a9fb4115011c8..7cbee1fd44f517cc4d6e0602eda01163737dd93f 100644
--- a/deep_speech_2/demo_server.py
+++ b/deep_speech_2/demo_server.py
@@ -25,7 +25,7 @@ add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
 add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
 add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
 add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                             "bi-directional RNNs. Not for GRU.")
@@ -51,9 +51,9 @@ add_arg('model_path',       str,
 add_arg('lang_model_path',  str,
         'lm/data/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
-add_arg('decoder_method',   str,
+add_arg('decoding_method',  str,
         'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
         choices = ['ctc_beam_search', 'ctc_greedy'])
 add_arg('specgram_type',    str,
         'linear',
@@ -160,7 +160,7 @@ def start_server():
         feature = data_generator.process_utterance(filename, "")
         result_transcript = ds2_model.infer_batch(
             infer_data=[feature],
-            decoder_method=args.decoder_method,
+            decoding_method=args.decoding_method,
             beam_alpha=args.alpha,
             beam_beta=args.beta,
             beam_size=args.beam_size,
diff --git a/deep_speech_2/evaluate.py b/deep_speech_2/evaluate.py
index 35888f82de3df65c73d6e2009db0acdc48e975a3..1cc307dad3e611fe73cd7786976bfaca6a7c8227 100644
--- a/deep_speech_2/evaluate.py
+++ b/deep_speech_2/evaluate.py
@@ -17,15 +17,15 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('batch_size',       int,    128,    "Minibatch size.")
 add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
 add_arg('beam_size',        int,    500,    "Beam search width.")
-add_arg('parallels_bsearch',int,    12,     "# of CPUs for beam search.")
-add_arg('parallels_data',   int,    12,     "# of CPUs for data preprocessing.")
+add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
+add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
 add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
 add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
 add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                             "bi-directional RNNs. Not for GRU.")
@@ -45,9 +45,9 @@ add_arg('model_path',       str,
 add_arg('lang_model_path',  str,
         'lm/data/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
-add_arg('decoder_method',   str,
+add_arg('decoding_method',  str,
         'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
         choices = ['ctc_beam_search', 'ctc_greedy'])
 add_arg('error_rate_type',  str,
         'wer',
@@ -68,7 +68,7 @@ def evaluate():
         mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.parallels_data)
+        num_threads=args.num_proc_data)
     batch_reader = data_generator.batch_reader_creator(
         manifest_path=args.test_manifest,
         batch_size=args.batch_size,
@@ -90,14 +90,14 @@ def evaluate():
     for infer_data in batch_reader():
         result_transcripts = ds2_model.infer_batch(
             infer_data=infer_data,
-            decoder_method=args.decoder_method,
+            decoding_method=args.decoding_method,
             beam_alpha=args.alpha,
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
             language_model_path=args.lang_model_path,
-            num_processes=args.parallels_bsearch)
+            num_processes=args.num_proc_bsearch)
         target_transcripts = [
             ''.join([data_generator.vocab_list[token] for token in transcript])
             for _, transcript in infer_data
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 9d4bff849807496bee3c87752a949c76515f4541..3fd835b467f0d838efa05410be898c0a75aac24d 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -17,14 +17,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('num_samples',      int,    10,     "# of samples to infer.")
 add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
 add_arg('beam_size',        int,    500,    "Beam search width.")
-add_arg('parallels_bsearch',int,    12,     "# of CPUs for beam search.")
+add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
 add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
 add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
 add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                             "bi-directional RNNs. Not for GRU.")
@@ -44,9 +44,9 @@ add_arg('model_path',       str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
-add_arg('decoder_method',   str,
+add_arg('decoding_method',  str,
         'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
         choices = ['ctc_beam_search', 'ctc_greedy'])
 add_arg('error_rate_type',  str,
         'wer',
@@ -86,14 +86,14 @@ def infer():
         share_rnn_weights=args.share_rnn_weights)
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
-        decoder_method=args.decoder_method,
+        decoding_method=args.decoding_method,
         beam_alpha=args.alpha,
         beam_beta=args.beta,
         beam_size=args.beam_size,
         cutoff_prob=args.cutoff_prob,
         vocab_list=data_generator.vocab_list,
         language_model_path=args.lang_model_path,
-        num_processes=args.parallels_bsearch)
+        num_processes=args.num_proc_bsearch)
 
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     target_transcripts = [
diff --git a/deep_speech_2/model.py b/deep_speech_2/model.py
index 894605bfd85d7bdc116332c03e9fa5645cc32726..06f69290682226dffc601711d81f45242e23538d 100644
--- a/deep_speech_2/model.py
+++ b/deep_speech_2/model.py
@@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
         # run inference
         return self._loss_inferer.infer(input=infer_data)
 
-    def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta,
+    def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
                     beam_size, cutoff_prob, vocab_list, language_model_path,
                     num_processes):
         """Model inference. Infer the transcription for a batch of speech
@@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
                            consisting of a tuple of audio features and
                            transcription text (empty string).
         :type infer_data: list
-        :param decoder_method: Decoding method name, 'ctc_greedy' or
-                               'ctc_beam_search'.
-        :param decoder_method: string
+        :param decoding_method: Decoding method name, 'ctc_greedy' or
+                                'ctc_beam_search'.
+        :type decoding_method: string
         :param beam_alpha: Parameter associated with language model.
         :type beam_alpha: float
         :param beam_beta: Parameter associated with word count.
@@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
         ]
         # run decoder
         results = []
-        if decoder_method == "ctc_greedy":
+        if decoding_method == "ctc_greedy":
             # best path decode
             for i, probs in enumerate(probs_split):
                 output_transcription = ctc_greedy_decoder(
                     probs_seq=probs, vocabulary=vocab_list)
                 results.append(output_transcription)
-        elif decoder_method == "ctc_beam_search":
+        elif decoding_method == "ctc_beam_search":
             # initialize external scorer
             if self._ext_scorer == None:
                 self._ext_scorer = LmScorer(beam_alpha, beam_beta,
@@ -217,8 +217,8 @@ class DeepSpeech2Model(object):
 
             results = [result[0][1] for result in beam_search_results]
         else:
-            raise ValueError("Decoder method [%s] is not supported." %
-                             decoder_method)
+            raise ValueError("Decoding method [%s] is not supported." %
+                             decoding_method)
         return results
 
     def _create_parameters(self, model_path=None):
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index 966e1d9b660838cee625388ae22187665261259a..7cef7539b35b805030976303ea901e6d8081386e 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -16,7 +16,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('batch_size',       int,    256,    "Minibatch size.")
 add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
 add_arg('num_passes',       int,    200,    "# of training epochs.")
-add_arg('parallels_data',   int,    12,     "# of CPUs for data preprocessing.")
+add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
@@ -28,7 +28,7 @@ add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
 add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('is_local',         bool,   True,   "Use pserver or not.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                             "bi-directional RNNs. Not for GRU.")
 add_arg('train_manifest',   str,
@@ -74,13 +74,13 @@ def train():
         max_duration=args.max_duration,
         min_duration=args.min_duration,
         specgram_type=args.specgram_type,
-        num_threads=args.parallels_data)
+        num_threads=args.num_proc_data)
     dev_generator = DataGenerator(
         vocab_filepath=args.vocab_path,
         mean_std_filepath=args.mean_std_path,
         augmentation_config="{}",
         specgram_type=args.specgram_type,
-        num_threads=args.parallels_data)
+        num_threads=args.num_proc_data)
     train_batch_reader = train_generator.batch_reader_creator(
         manifest_path=args.train_manifest,
         batch_size=args.batch_size,
diff --git a/deep_speech_2/tune.py b/deep_speech_2/tune.py
index 62e8f288434c4366381ab76cc75ced8ea7d64a7c..eab00cfdb3ff54725767373df6a84ff4e4bc505e 100644
--- a/deep_speech_2/tune.py
+++ b/deep_speech_2/tune.py
@@ -18,7 +18,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('num_samples',      int,    100,    "# of samples to infer.")
 add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
 add_arg('beam_size',        int,    500,    "Beam search width.")
-add_arg('parallels_bsearch',int,    12,     "# of CPUs for beam search.")
+add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
@@ -29,7 +29,7 @@ add_arg('alpha_to',         float,  0.36,   "Where alpha ends tuning with.")
 add_arg('beta_from',        float,  0.05,   "Where beta starts tuning from.")
 add_arg('beta_to',          float,  0.36,   "Where beta ends tuning with.")
 add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                             "bi-directional RNNs. Not for GRU.")
@@ -104,14 +104,14 @@ def tune():
     for alpha, beta in params_grid:
         result_transcripts = ds2_model.infer_batch(
             infer_data=tune_data,
-            decoder_method='ctc_beam_search',
+            decoding_method='ctc_beam_search',
             beam_alpha=alpha,
             beam_beta=beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
             language_model_path=args.lang_model_path,
-            num_processes=args.parallels_bsearch)
+            num_processes=args.num_proc_bsearch)
         wer_sum, num_ins = 0.0, 0
         for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)