diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index db940639a3dfc24e3e1108cd55745562aaa2a74a..758799716a15dc908c8dcc1afe018bd5aa441810 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -24,8 +24,6 @@

 ## Installation

-### Basic setup
-
 Please make sure the above [prerequisites](#prerequisites) have been satisfied before moving on.

 ```bash
@@ -34,16 +32,6 @@ cd models/deep_speech_2
 sh setup.sh
 ```

-### Decoders setup
-
-```bash
-cd decoders/swig
-sh setup.sh
-cd ../..
-```
-
-These commands will install the decoders that translate the ouptut probability vectors of DS2 model to text data, incuding CTC greedy decoder, CTC beam search decoder and its batch version. And a detailed usuage about them will be given in the following sections.
-
 ## Getting Started

 Several shell scripts provided in `./examples` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
@@ -189,7 +177,6 @@ Data augmentation has often been a highly effective technique to boost the deep

 Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline.

-### Inference
 - Volume Perturbation
 - Speed Perturbation
 - Shifting Perturbation
diff --git a/deep_speech_2/data_utils/featurizer/text_featurizer.py b/deep_speech_2/data_utils/featurizer/text_featurizer.py
index 89202163ca8d8b69f59b858db5451882d7e089b3..95dc637e0d76cc310cc732bd058215cedf9b007c 100644
--- a/deep_speech_2/data_utils/featurizer/text_featurizer.py
+++ b/deep_speech_2/data_utils/featurizer/text_featurizer.py
@@ -22,6 +22,8 @@ class TextFeaturizer(object):
     def __init__(self, vocab_filepath):
         self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
             vocab_filepath)
+        # from unicode to string
+        self._vocab_list = [chars.encode("utf-8") for chars in self._vocab_list]

     def featurize(self, text):
         """Convert text string to a list of token indices in char-level.Note
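
The `TextFeaturizer` change above is what allows `infer.py` and `test.py` (later in this diff) to drop their own `[chars.encode("utf-8") ...]` conversion and hand `data_generator.vocab_list` straight to the decoders. A rough Python 2 sketch of the idea; the loader below is illustrative, not the project's code, and the vocabulary path is only an example:

```python
# -*- coding: utf-8 -*-
import codecs

def load_vocabulary(vocab_filepath):
    """Read one character per line and return UTF-8 encoded byte strings."""
    with codecs.open(vocab_filepath, 'r', 'utf-8') as f:
        vocab_list = [line.rstrip('\n') for line in f]  # unicode objects
    # same "from unicode to string" conversion as TextFeaturizer.__init__ above
    return [chars.encode("utf-8") for chars in vocab_list]

# e.g. vocab_list = load_vocabulary('models/librispeech/vocab.txt')
```
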
diff --git a/deep_speech_2/decoders/swig/ctc_decoders.cpp b/deep_speech_2/decoders/swig/ctc_decoders.cpp
index b52394b6e19a8da4192accb1741502b0d19bbbc9..e86bfe0f2c554e56f54283db79444985f863fb41 100644
--- a/deep_speech_2/decoders/swig/ctc_decoders.cpp
+++ b/deep_speech_2/decoders/swig/ctc_decoders.cpp
@@ -17,41 +17,38 @@ std::string ctc_greedy_decoder(
     const std::vector<std::vector<double>> &probs_seq,
     const std::vector<std::string> &vocabulary) {
   // dimension check
-  int num_time_steps = probs_seq.size();
-  for (int i = 0; i < num_time_steps; i++) {
-    if (probs_seq[i].size() != vocabulary.size() + 1) {
-      std::cout << "The shape of probs_seq does not match"
-                << " with the shape of the vocabulary!" << std::endl;
-      exit(1);
-    }
+  size_t num_time_steps = probs_seq.size();
+  for (size_t i = 0; i < num_time_steps; i++) {
+    VALID_CHECK_EQ(probs_seq[i].size(),
+                   vocabulary.size() + 1,
+                   "The shape of probs_seq does not match with "
+                   "the shape of the vocabulary");
   }

-  int blank_id = vocabulary.size();
+  size_t blank_id = vocabulary.size();

-  std::vector<int> max_idx_vec;
-  double max_prob = 0.0;
-  int max_idx = 0;
-  for (int i = 0; i < num_time_steps; i++) {
-    for (int j = 0; j < probs_seq[i].size(); j++) {
+  std::vector<size_t> max_idx_vec;
+  for (size_t i = 0; i < num_time_steps; i++) {
+    double max_prob = 0.0;
+    size_t max_idx = 0;
+    for (size_t j = 0; j < probs_seq[i].size(); j++) {
       if (max_prob < probs_seq[i][j]) {
         max_idx = j;
         max_prob = probs_seq[i][j];
       }
     }
     max_idx_vec.push_back(max_idx);
-    max_prob = 0.0;
-    max_idx = 0;
   }

-  std::vector<int> idx_vec;
-  for (int i = 0; i < max_idx_vec.size(); i++) {
+  std::vector<size_t> idx_vec;
+  for (size_t i = 0; i < max_idx_vec.size(); i++) {
     if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) {
       idx_vec.push_back(max_idx_vec[i]);
     }
   }
   std::string best_path_result;
-  for (int i = 0; i < idx_vec.size(); i++) {
+  for (size_t i = 0; i < idx_vec.size(); i++) {
     if (idx_vec[i] != blank_id) {
       best_path_result += vocabulary[idx_vec[i]];
     }
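
Read as a whole, the greedy decoder above takes the arg-max label at every time step, collapses consecutive repeats, and drops the blank label, which this code always places at index `vocabulary.size()`; that convention is also what makes the explicit `blank_id` argument removable from the beam-search functions below. A minimal Python sketch of the same best-path logic (illustrative only, not the project's implementation):

```python
def greedy_decode(probs_seq, vocabulary):
    """Best-path CTC decoding: argmax per step, merge repeats, drop blanks."""
    blank_id = len(vocabulary)  # blank is assumed to be the last index
    max_idx = [max(range(len(step)), key=step.__getitem__) for step in probs_seq]
    result = []
    for i, idx in enumerate(max_idx):
        if i > 0 and idx == max_idx[i - 1]:
            continue  # collapse consecutive repeats first
        if idx != blank_id:
            result.append(vocabulary[idx])
    return ''.join(result)

# e.g. greedy_decode([[0.1, 0.7, 0.2], [0.1, 0.7, 0.2], [0.2, 0.1, 0.7]], ['a', 'b'])
# returns 'b': the repeated argmax 'b' is merged and the trailing blank is dropped.
```
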
" << std::endl; - exit(1); - } + // assign blank id + size_t blank_id = vocabulary.size(); - // assign space ID + // assign space id std::vector::iterator it = std::find(vocabulary.begin(), vocabulary.end(), " "); int space_id = it - vocabulary.begin(); @@ -98,16 +90,16 @@ std::vector> ctc_beam_search_decoder( std::vector prefixes; prefixes.push_back(&root); - if (extscorer != nullptr) { - if (extscorer->is_char_map_empty()) { - extscorer->set_char_map(vocabulary); + if (ext_scorer != nullptr) { + if (ext_scorer->is_char_map_empty()) { + ext_scorer->set_char_map(vocabulary); } - if (!extscorer->is_character_based()) { - if (extscorer->dictionary == nullptr) { + if (!ext_scorer->is_character_based()) { + if (ext_scorer->dictionary == nullptr) { // fill dictionary for fst with space - extscorer->fill_dictionary(true); + ext_scorer->fill_dictionary(true); } - auto fst_dict = static_cast(extscorer->dictionary); + auto fst_dict = static_cast(ext_scorer->dictionary); fst::StdVectorFst *dict_ptr = fst_dict->Copy(true); root.set_dictionary(dict_ptr); auto matcher = std::make_shared(*dict_ptr, fst::MATCH_INPUT); @@ -116,33 +108,33 @@ std::vector> ctc_beam_search_decoder( } // prefix search over time - for (int time_step = 0; time_step < num_time_steps; time_step++) { + for (size_t time_step = 0; time_step < num_time_steps; time_step++) { std::vector prob = probs_seq[time_step]; std::vector> prob_idx; - for (int i = 0; i < prob.size(); i++) { + for (size_t i = 0; i < prob.size(); i++) { prob_idx.push_back(std::pair(i, prob[i])); } float min_cutoff = -NUM_FLT_INF; bool full_beam = false; - if (extscorer != nullptr) { - int num_prefixes = std::min((int)prefixes.size(), beam_size); + if (ext_scorer != nullptr) { + size_t num_prefixes = std::min(prefixes.size(), beam_size); std::sort( prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); min_cutoff = prefixes[num_prefixes - 1]->score + log(prob[blank_id]) - - std::max(0.0, extscorer->beta); + std::max(0.0, ext_scorer->beta); full_beam = (num_prefixes == beam_size); } // pruning of vacobulary - int cutoff_len = prob.size(); + size_t cutoff_len = prob.size(); if (cutoff_prob < 1.0 || cutoff_top_n < prob.size()) { std::sort( prob_idx.begin(), prob_idx.end(), pair_comp_second_rev); if (cutoff_prob < 1.0) { double cum_prob = 0.0; cutoff_len = 0; - for (int i = 0; i < prob_idx.size(); i++) { + for (size_t i = 0; i < prob_idx.size(); i++) { cum_prob += prob_idx[i].second; cutoff_len += 1; if (cum_prob >= cutoff_prob) break; @@ -152,18 +144,18 @@ std::vector> ctc_beam_search_decoder( prob_idx = std::vector>( prob_idx.begin(), prob_idx.begin() + cutoff_len); } - std::vector> log_prob_idx; - for (int i = 0; i < cutoff_len; i++) { + std::vector> log_prob_idx; + for (size_t i = 0; i < cutoff_len; i++) { log_prob_idx.push_back(std::pair( prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN))); } // loop over chars - for (int index = 0; index < log_prob_idx.size(); index++) { + for (size_t index = 0; index < log_prob_idx.size(); index++) { auto c = log_prob_idx[index].first; float log_prob_c = log_prob_idx[index].second; - for (int i = 0; i < prefixes.size() && i < beam_size; i++) { + for (size_t i = 0; i < prefixes.size() && i < beam_size; i++) { auto prefix = prefixes[i]; if (full_beam && log_prob_c + prefix->score < min_cutoff) { @@ -194,12 +186,12 @@ std::vector> ctc_beam_search_decoder( } // language model scoring - if (extscorer != nullptr && - (c == space_id || extscorer->is_character_based())) { + if (ext_scorer != nullptr && + (c == space_id 
@@ -194,12 +186,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
         }

         // language model scoring
-        if (extscorer != nullptr &&
-            (c == space_id || extscorer->is_character_based())) {
+        if (ext_scorer != nullptr &&
+            (c == space_id || ext_scorer->is_character_based())) {
           PathTrie *prefix_toscore = nullptr;

           // skip scoring the space
-          if (extscorer->is_character_based()) {
+          if (ext_scorer->is_character_based()) {
             prefix_toscore = prefix_new;
           } else {
             prefix_toscore = prefix;
@@ -207,11 +199,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(

           double score = 0.0;
           std::vector<std::string> ngram;
-          ngram = extscorer->make_ngram(prefix_toscore);
-          score = extscorer->get_log_cond_prob(ngram) * extscorer->alpha;
+          ngram = ext_scorer->make_ngram(prefix_toscore);
+          score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;

           log_p += score;
-          log_p += extscorer->beta;
+          log_p += ext_scorer->beta;
         }
         prefix_new->log_prob_nb_cur =
             log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
@@ -240,15 +232,15 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
   for (size_t i = 0; i < beam_size && i < prefixes.size(); i++) {
     double approx_ctc = prefixes[i]->score;

-    if (extscorer != nullptr) {
+    if (ext_scorer != nullptr) {
       std::vector<int> output;
       prefixes[i]->get_path_vec(output);
       size_t prefix_length = output.size();
-      auto words = extscorer->split_labels(output);
+      auto words = ext_scorer->split_labels(output);
       // remove word insert
-      approx_ctc = approx_ctc - prefix_length * extscorer->beta;
+      approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
       // remove language model weight:
-      approx_ctc -= (extscorer->get_sent_log_prob(words)) * extscorer->alpha;
+      approx_ctc -= (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
     }

     prefixes[i]->approx_ctc = approx_ctc;
@@ -269,7 +261,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     space_prefixes[i]->get_path_vec(output);
     // convert index to string
     std::string output_str;
-    for (int j = 0; j < output.size(); j++) {
+    for (size_t j = 0; j < output.size(); j++) {
       output_str += vocabulary[output[j]];
     }
     std::pair<double, std::string> output_pair(-space_prefixes[i]->approx_ctc,
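
On the `alpha` and `beta` weights handled above (retuned to 2.15 and 0.35 further down in this diff): each time a prefix is extended past a word boundary, or past any character for a character-based scorer, the decoder adds `alpha * log P_lm(ngram) + beta` to the prefix's log probability, and the reporting loop subtracts the same two terms again to recover an approximate pure-CTC score. Schematically, for a word-based scorer (hypothetical helper, not code from this repo):

```python
def scored_prefix_log_prob(ctc_log_prob, lm_log_prob, word_count,
                           alpha=2.15, beta=0.35):
    """Schematic combination of CTC score, LM score and word-insertion bonus."""
    # alpha scales the n-gram language model log-probability,
    # beta is a flat reward added once per inserted word.
    return ctc_log_prob + alpha * lm_log_prob + beta * word_count
```
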
@@ -283,49 +275,45 @@
 std::vector<std::vector<std::pair<double, std::string>>>
 ctc_beam_search_decoder_batch(
     const std::vector<std::vector<std::vector<double>>> &probs_split,
-    int beam_size,
+    const size_t beam_size,
     const std::vector<std::string> &vocabulary,
-    int blank_id,
-    int num_processes,
-    double cutoff_prob,
-    int cutoff_top_n,
-    Scorer *extscorer) {
-  if (num_processes <= 0) {
-    std::cout << "num_processes must be nonnegative!" << std::endl;
-    exit(1);
-  }
+    const size_t num_processes,
+    const double cutoff_prob,
+    const size_t cutoff_top_n,
+    Scorer *ext_scorer) {
+  VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   // thread pool
   ThreadPool pool(num_processes);
   // number of samples
-  int batch_size = probs_split.size();
+  size_t batch_size = probs_split.size();

   // scorer filling up
-  if (extscorer != nullptr) {
-    if (extscorer->is_char_map_empty()) {
-      extscorer->set_char_map(vocabulary);
+  if (ext_scorer != nullptr) {
+    if (ext_scorer->is_char_map_empty()) {
+      ext_scorer->set_char_map(vocabulary);
     }
-    if (!extscorer->is_character_based() && extscorer->dictionary == nullptr) {
+    if (!ext_scorer->is_character_based() &&
+        ext_scorer->dictionary == nullptr) {
       // init dictionary
-      extscorer->fill_dictionary(true);
+      ext_scorer->fill_dictionary(true);
     }
   }

   // enqueue the tasks of decoding
   std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
-  for (int i = 0; i < batch_size; i++) {
+  for (size_t i = 0; i < batch_size; i++) {
     res.emplace_back(pool.enqueue(ctc_beam_search_decoder,
                                   probs_split[i],
                                   beam_size,
                                   vocabulary,
-                                  blank_id,
                                   cutoff_prob,
                                   cutoff_top_n,
-                                  extscorer));
+                                  ext_scorer));
   }

   // get decoding results
   std::vector<std::vector<std::pair<double, std::string>>> batch_results;
-  for (int i = 0; i < batch_size; i++) {
+  for (size_t i = 0; i < batch_size; i++) {
     batch_results.emplace_back(res[i].get());
   }
   return batch_results;
diff --git a/deep_speech_2/decoders/swig/ctc_decoders.h b/deep_speech_2/decoders/swig/ctc_decoders.h
index b8c512bda84a4abb3a9b36c98e5896274361d001..6384c8a8fc017425bdd0b8fdf8210684a19c095a 100644
--- a/deep_speech_2/decoders/swig/ctc_decoders.h
+++ b/deep_speech_2/decoders/swig/ctc_decoders.h
@@ -27,21 +27,21 @@ std::string ctc_greedy_decoder(
  *                over vocabulary of one time step.
  *     beam_size: The width of beam search.
  *     vocabulary: A vector of vocabulary.
- *     blank_id: ID of blank.
  *     cutoff_prob: Cutoff probability for pruning.
  *     cutoff_top_n: Cutoff number for pruning.
- *     ext_scorer: External scorer to evaluate a prefix.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
 * Return:
 *     A vector that each element is a pair of score and decoding result,
 *     in desending order.
 */
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
-    int beam_size,
+    const size_t beam_size,
     std::vector<std::string> vocabulary,
-    int blank_id,
-    double cutoff_prob = 1.0,
-    int cutoff_top_n = 40,
+    const double cutoff_prob = 1.0,
+    const size_t cutoff_top_n = 40,
     Scorer *ext_scorer = NULL);

 /* CTC Beam Search Decoder for batch data
@@ -52,11 +52,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
 *     .
 *     beam_size: The width of beam search.
 *     vocabulary: A vector of vocabulary.
- *     blank_id: ID of blank.
 *     num_processes: Number of threads for beam search.
 *     cutoff_prob: Cutoff probability for pruning.
 *     cutoff_top_n: Cutoff number for pruning.
- *     ext_scorer: External scorer to evaluate a prefix.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
 * Return:
 *     A 2-D vector that each element is a vector of beam search decoding
 *     result for one audio sample.
@@ -64,12 +65,11 @@
 std::vector<std::vector<std::pair<double, std::string>>>
 ctc_beam_search_decoder_batch(
     const std::vector<std::vector<std::vector<double>>> &probs_split,
-    int beam_size,
+    const size_t beam_size,
     const std::vector<std::string> &vocabulary,
-    int blank_id,
-    int num_processes,
+    const size_t num_processes,
     double cutoff_prob = 1.0,
-    int cutoff_top_n = 40,
+    const size_t cutoff_top_n = 40,
     Scorer *ext_scorer = NULL);

 #endif  // CTC_BEAM_SEARCH_DECODER_H_
diff --git a/deep_speech_2/decoders/swig/decoder_utils.h b/deep_speech_2/decoders/swig/decoder_utils.h
index d4ee36e1bfcbe586939ca3e9acb4ef2857a8433a..015646ddd7beed114a454fca8e577e6b08221964 100644
--- a/deep_speech_2/decoders/swig/decoder_utils.h
+++ b/deep_speech_2/decoders/swig/decoder_utils.h
@@ -7,6 +7,22 @@
 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();

+// check if __A == _B
+#define VALID_CHECK_EQ(__A, __B, __ERR)            \
+  if ((__A) != (__B)) {                            \
+    std::ostringstream str;                        \
+    str << (__A) << " != " << (__B) << ", ";       \
+    throw std::runtime_error(str.str() + __ERR);   \
+  }
+
+// check if __A > __B
+#define VALID_CHECK_GT(__A, __B, __ERR)            \
+  if ((__A) <= (__B)) {                            \
+    std::ostringstream str;                        \
+    str << (__A) << " <= " << (__B) << ", ";       \
+    throw std::runtime_error(str.str() + __ERR);   \
+  }
+
 // Function template for comparing two pairs
 template <typename T1, typename T2>
 bool pair_comp_first_rev(const std::pair<T1, T2> &a,
diff --git a/deep_speech_2/decoders/swig_wrapper.py b/deep_speech_2/decoders/swig_wrapper.py
index 202440bfba2e23e05278d41fdd8ebeedbd797f32..54ed249f3859e7b74ab829019324a784e9d8ce0b 100644
--- a/deep_speech_2/decoders/swig_wrapper.py
+++ b/deep_speech_2/decoders/swig_wrapper.py
@@ -41,7 +41,6 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
 def ctc_beam_search_decoder(probs_seq,
                             beam_size,
                             vocabulary,
-                            blank_id,
                             cutoff_prob=1.0,
                             cutoff_top_n=40,
                             ext_scoring_func=None):
@@ -55,8 +54,6 @@ def ctc_beam_search_decoder(probs_seq,
     :type beam_size: int
     :param vocabulary: Vocabulary list.
     :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
     :param cutoff_prob: Cutoff probability in pruning,
                         default 1.0, no pruning.
     :type cutoff_prob: float
@@ -72,15 +69,14 @@ def ctc_beam_search_decoder(probs_seq,
         results, in descending order of the probability.
     :rtype: list
     """
-    return swig_decoders.ctc_beam_search_decoder(
-        probs_seq.tolist(), beam_size, vocabulary, blank_id, cutoff_prob,
-        cutoff_top_n, ext_scoring_func)
+    return swig_decoders.ctc_beam_search_decoder(probs_seq.tolist(), beam_size,
+                                                 vocabulary, cutoff_prob,
+                                                 cutoff_top_n, ext_scoring_func)


 def ctc_beam_search_decoder_batch(probs_split,
                                   beam_size,
                                   vocabulary,
-                                  blank_id,
                                   num_processes,
                                   cutoff_prob=1.0,
                                   cutoff_top_n=40,
@@ -94,8 +90,6 @@ def ctc_beam_search_decoder_batch(probs_split,
     :type beam_size: int
     :param vocabulary: Vocabulary list.
     :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
     :param num_processes: Number of parallel processes.
     :type num_processes: int
     :param cutoff_prob: Cutoff probability in vocabulary pruning,
@@ -118,5 +112,5 @@ def ctc_beam_search_decoder_batch(probs_split,
     probs_split = [probs_seq.tolist() for probs_seq in probs_split]

     return swig_decoders.ctc_beam_search_decoder_batch(
-        probs_split, beam_size, vocabulary, blank_id, num_processes,
-        cutoff_prob, cutoff_top_n, ext_scoring_func)
+        probs_split, beam_size, vocabulary, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func)
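
After this change a caller of the SWIG wrappers no longer passes `blank_id`; the blank is taken to be the last column of `probs_seq`, i.e. index `len(vocabulary)`. A hypothetical call with made-up shapes and values (it assumes the compiled `swig_decoders` package from `decoders/swig` has been installed, see the `setup.sh` hunk below):

```python
import numpy as np
from decoders.swig_wrapper import ctc_beam_search_decoder

vocabulary = ['a', 'b', 'c', ' ']                    # blank is implicitly index 4
probs_seq = np.random.rand(20, len(vocabulary) + 1)  # 20 time steps
probs_seq /= probs_seq.sum(axis=1, keepdims=True)    # make each row a distribution

# beam search without an external scorer (ext_scoring_func defaults to None)
beam_results = ctc_beam_search_decoder(
    probs_seq=probs_seq,
    beam_size=20,
    vocabulary=vocabulary,
    cutoff_prob=1.0,
    cutoff_top_n=40)
print(beam_results[0])  # best (score, transcript) pair
```
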
diff --git a/deep_speech_2/examples/librispeech/run_test_golden.sh b/deep_speech_2/examples/librispeech/run_test_golden.sh
index 080c3c0622d62169d63f0e1f1bf3d9ceb7d24da0..e539bd0137251e1d81503511aae7e4b02b8d5e96 100644
--- a/deep_speech_2/examples/librispeech/run_test_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_test_golden.sh
@@ -31,13 +31,13 @@ python -u test.py \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
---alpha=0.36 \
---beta=0.25 \
---cutoff_prob=0.99 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
---test_manifest='data/tiny/manifest.test-clean' \
+--test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='models/librispeech/mean_std.npz' \
 --vocab_path='models/librispeech/vocab.txt' \
 --model_path='models/librispeech/params.tar.gz' \
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 48c4ef493a47f9d10b2b534522b2430f52add12b..5da1db970c13c7356f0e2b8ca05efa072ac2ba8e 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -21,9 +21,9 @@ add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
-add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
-add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('alpha',            float,  2.15,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.35,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
 add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
@@ -85,7 +85,6 @@ def infer():
         pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)

-    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
         decoding_method=args.decoding_method,
@@ -93,7 +92,7 @@ def infer():
         beam_beta=args.beta,
         beam_size=args.beam_size,
         cutoff_prob=args.cutoff_prob,
-        vocab_list=vocab_list,
+        vocab_list=data_generator.vocab_list,
         language_model_path=args.lang_model_path,
         num_processes=args.num_proc_bsearch)

diff --git a/deep_speech_2/model_utils/model.py b/deep_speech_2/model_utils/model.py
index 5812afca6ec54ef47afe1a0735258af4e5051221..1a9910e9d9f5a0129a22c7e93cbd1c4a272eb89e 100644
--- a/deep_speech_2/model_utils/model.py
+++ b/deep_speech_2/model_utils/model.py
@@ -214,7 +214,6 @@ class DeepSpeech2Model(object):
                 probs_split=probs_split,
                 vocabulary=vocab_list,
                 beam_size=beam_size,
-                blank_id=len(vocab_list),
                 num_processes=num_processes,
                 ext_scoring_func=self._ext_scorer,
                 cutoff_prob=cutoff_prob)
diff --git a/deep_speech_2/setup.sh b/deep_speech_2/setup.sh
index 6c8a709941ae94124149482f1886bf445c170af8..dcb3e0fbc21b2a9502f12d32cf75e3ab3b344b1a 100644
--- a/deep_speech_2/setup.sh
+++ b/deep_speech_2/setup.sh
@@ -26,4 +26,13 @@ if [ $? != 0 ]; then
     rm libsndfile-1.0.28.tar.gz
 fi

+# install decoders
+python -c "import swig_decoders"
+if [ $? != 0 ]; then
+    pushd decoders/swig > /dev/null
+    sh setup.sh
+    popd > /dev/null
+fi
+
+
 echo "Install all dependencies successfully."
diff --git a/deep_speech_2/test.py b/deep_speech_2/test.py
index 499f71f62a627e4d9b94a35c7972e974cc4b6c9a..76efb4d1e196fcfe40358b028bfe966f224fb8eb 100644
--- a/deep_speech_2/test.py
+++ b/deep_speech_2/test.py
@@ -22,9 +22,9 @@ add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
-add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
-add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('alpha',            float,  2.15,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.35,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
 add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
@@ -85,7 +85,6 @@ def evaluate():
         pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)

-    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     error_sum, num_ins = 0.0, 0
     for infer_data in batch_reader():
@@ -96,7 +95,7 @@ def evaluate():
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
-            vocab_list=vocab_list,
+            vocab_list=data_generator.vocab_list,
             language_model_path=args.lang_model_path,
             num_processes=args.num_proc_bsearch)
         target_transcripts = [
diff --git a/deep_speech_2/utils/utility.sh b/deep_speech_2/utils/utility.sh
index c8121126a1f4369bb0289f1f1b5892cd5556ff77..aa0ec002bca0597bd61728180e2fb1ccdd02e5d6 100644
--- a/deep_speech_2/utils/utility.sh
+++ b/deep_speech_2/utils/utility.sh
@@ -13,7 +13,7 @@ download() {

     wget -c $URL -P `dirname "$TARGET"`
     md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
-    if [ $MD5 -ne $md5_result ]; then
+    if [ ! $MD5 == $md5_result ]; then
        echo "Fail to download the language model!"
        return 1
    fi
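
One practical note on the `setup.sh` hunk above: the decoder build is now triggered from the main setup script only when `import swig_decoders` fails, so repeated runs are cheap. The same guard can be written in Python, for example as a post-install sanity check (sketch only, not part of the repository):

```python
import subprocess

def ensure_swig_decoders():
    """Build the SWIG decoders only if the module is not importable yet."""
    try:
        import swig_decoders  # noqa: F401 -- already built and installed
    except ImportError:
        # mirrors: pushd decoders/swig > /dev/null; sh setup.sh; popd
        subprocess.check_call(['sh', 'setup.sh'], cwd='decoders/swig')

if __name__ == '__main__':
    ensure_swig_decoders()
    print("swig_decoders build step finished.")
```
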