From a18e6a7eda2a936c567feae67bbab7bd732c8d17 Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Fri, 15 Sep 2017 22:30:40 +0800
Subject: [PATCH] refine by following review comments

---
 README.md                                 |  13 --
 data_utils/featurizer/text_featurizer.py  |   2 +
 decoders/swig/ctc_decoders.cpp            | 156 +++++++++++------------
 decoders/swig/ctc_decoders.h              |  24 ++--
 decoders/swig/decoder_utils.h             |  16 +++
 decoders/swig_wrapper.py                  |  16 +--
 examples/librispeech/run_test_golden.sh   |   8 +-
 infer.py                                  |   9 +-
 model_utils/model.py                      |   1 -
 setup.sh                                  |   9 ++
 test.py                                   |   9 +-
 utils/utility.sh                          |   2 +-
 12 files changed, 129 insertions(+), 136 deletions(-)

diff --git a/README.md b/README.md
index db940639..75879971 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,6 @@
 
 ## Installation
 
-### Basic setup
-
 Please make sure the above [prerequisites](#prerequisites) have been satisfied before moving on.
 
 ```bash
@@ -34,16 +32,6 @@ cd models/deep_speech_2
 sh setup.sh
 ```
 
-### Decoders setup
-
-```bash
-cd decoders/swig
-sh setup.sh
-cd ../..
-```
-
-These commands will install the decoders that translate the ouptut probability vectors of DS2 model to text data, incuding CTC greedy decoder, CTC beam search decoder and its batch version. And a detailed usuage about them will be given in the following sections.
-
 ## Getting Started
 
 Several shell scripts provided in `./examples` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
@@ -189,7 +177,6 @@ Data augmentation has often been a highly effective technique to boost the deep
 
 Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline.
 
-### Inference
 - Volume Perturbation
 - Speed Perturbation
 - Shifting Perturbation
diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py
index 89202163..95dc637e 100644
--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -22,6 +22,8 @@ class TextFeaturizer(object):
     def __init__(self, vocab_filepath):
         self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
             vocab_filepath)
+        # from unicode to string
+        self._vocab_list = [chars.encode("utf-8") for chars in self._vocab_list]
 
     def featurize(self, text):
         """Convert text string to a list of token indices in char-level.Note
diff --git a/decoders/swig/ctc_decoders.cpp b/decoders/swig/ctc_decoders.cpp
index b52394b6..e86bfe0f 100644
--- a/decoders/swig/ctc_decoders.cpp
+++ b/decoders/swig/ctc_decoders.cpp
@@ -17,41 +17,38 @@ std::string ctc_greedy_decoder(
     const std::vector<std::vector<double>> &probs_seq,
     const std::vector<std::string> &vocabulary) {
   // dimension check
-  int num_time_steps = probs_seq.size();
-  for (int i = 0; i < num_time_steps; i++) {
-    if (probs_seq[i].size() != vocabulary.size() + 1) {
-      std::cout << "The shape of probs_seq does not match"
-                << " with the shape of the vocabulary!" << std::endl;
-      exit(1);
-    }
+  size_t num_time_steps = probs_seq.size();
+  for (size_t i = 0; i < num_time_steps; i++) {
+    VALID_CHECK_EQ(probs_seq[i].size(),
+                   vocabulary.size() + 1,
+                   "The shape of probs_seq does not match with "
+                   "the shape of the vocabulary");
   }
 
-  int blank_id = vocabulary.size();
+  size_t blank_id = vocabulary.size();
 
-  std::vector<int> max_idx_vec;
-  double max_prob = 0.0;
-  int max_idx = 0;
-  for (int i = 0; i < num_time_steps; i++) {
-    for (int j = 0; j < probs_seq[i].size(); j++) {
+  std::vector<size_t> max_idx_vec;
+  for (size_t i = 0; i < num_time_steps; i++) {
+    double max_prob = 0.0;
+    size_t max_idx = 0;
+    for (size_t j = 0; j < probs_seq[i].size(); j++) {
       if (max_prob < probs_seq[i][j]) {
         max_idx = j;
         max_prob = probs_seq[i][j];
       }
     }
     max_idx_vec.push_back(max_idx);
-    max_prob = 0.0;
-    max_idx = 0;
   }
 
-  std::vector<int> idx_vec;
-  for (int i = 0; i < max_idx_vec.size(); i++) {
+  std::vector<size_t> idx_vec;
+  for (size_t i = 0; i < max_idx_vec.size(); i++) {
     if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) {
       idx_vec.push_back(max_idx_vec[i]);
     }
   }
 
   std::string best_path_result;
-  for (int i = 0; i < idx_vec.size(); i++) {
+  for (size_t i = 0; i < idx_vec.size(); i++) {
     if (idx_vec[i] != blank_id) {
       best_path_result += vocabulary[idx_vec[i]];
     }
@@ -61,29 +58,24 @@ std::string ctc_greedy_decoder(
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
-    int beam_size,
+    const size_t beam_size,
     std::vector<std::string> vocabulary,
-    int blank_id,
-    double cutoff_prob,
-    int cutoff_top_n,
-    Scorer *extscorer) {
+    const double cutoff_prob,
+    const size_t cutoff_top_n,
+    Scorer *ext_scorer) {
   // dimension check
   size_t num_time_steps = probs_seq.size();
-  for (int i = 0; i < num_time_steps; i++) {
-    if (probs_seq[i].size() != vocabulary.size() + 1) {
-      std::cout << " The shape of probs_seq does not match"
-                << " with the shape of the vocabulary!" << std::endl;
-      exit(1);
-    }
+  for (size_t i = 0; i < num_time_steps; i++) {
+    VALID_CHECK_EQ(probs_seq[i].size(),
+                   vocabulary.size() + 1,
+                   "The shape of probs_seq does not match with "
+                   "the shape of the vocabulary");
   }
 
-  // blank_id check
-  if (blank_id > vocabulary.size()) {
-    std::cout << " Invalid blank_id! " << std::endl;
-    exit(1);
-  }
+  // assign blank id
+  size_t blank_id = vocabulary.size();
 
-  // assign space ID
+  // assign space id
   std::vector<std::string>::iterator it =
       std::find(vocabulary.begin(), vocabulary.end(), " ");
   int space_id = it - vocabulary.begin();
@@ -98,16 +90,16 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
   std::vector<PathTrie *> prefixes;
   prefixes.push_back(&root);
 
-  if (extscorer != nullptr) {
-    if (extscorer->is_char_map_empty()) {
-      extscorer->set_char_map(vocabulary);
+  if (ext_scorer != nullptr) {
+    if (ext_scorer->is_char_map_empty()) {
+      ext_scorer->set_char_map(vocabulary);
     }
-    if (!extscorer->is_character_based()) {
-      if (extscorer->dictionary == nullptr) {
+    if (!ext_scorer->is_character_based()) {
+      if (ext_scorer->dictionary == nullptr) {
         // fill dictionary for fst with space
-        extscorer->fill_dictionary(true);
+        ext_scorer->fill_dictionary(true);
       }
-      auto fst_dict = static_cast<fst::StdVectorFst *>(extscorer->dictionary);
+      auto fst_dict = static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
       fst::StdVectorFst *dict_ptr = fst_dict->Copy(true);
       root.set_dictionary(dict_ptr);
       auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
@@ -116,33 +108,33 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
   }
 
   // prefix search over time
-  for (int time_step = 0; time_step < num_time_steps; time_step++) {
+  for (size_t time_step = 0; time_step < num_time_steps; time_step++) {
     std::vector<double> prob = probs_seq[time_step];
     std::vector<std::pair<int, double>> prob_idx;
-    for (int i = 0; i < prob.size(); i++) {
+    for (size_t i = 0; i < prob.size(); i++) {
       prob_idx.push_back(std::pair<int, double>(i, prob[i]));
     }
 
     float min_cutoff = -NUM_FLT_INF;
     bool full_beam = false;
-    if (extscorer != nullptr) {
-      int num_prefixes = std::min((int)prefixes.size(), beam_size);
+    if (ext_scorer != nullptr) {
+      size_t num_prefixes = std::min(prefixes.size(), beam_size);
       std::sort(
           prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
       min_cutoff = prefixes[num_prefixes - 1]->score + log(prob[blank_id]) -
-                   std::max(0.0, extscorer->beta);
+                   std::max(0.0, ext_scorer->beta);
       full_beam = (num_prefixes == beam_size);
     }
 
     // pruning of vacobulary
-    int cutoff_len = prob.size();
+    size_t cutoff_len = prob.size();
     if (cutoff_prob < 1.0 || cutoff_top_n < prob.size()) {
       std::sort(
           prob_idx.begin(), prob_idx.end(), pair_comp_second_rev<int, double>);
       if (cutoff_prob < 1.0) {
         double cum_prob = 0.0;
         cutoff_len = 0;
-        for (int i = 0; i < prob_idx.size(); i++) {
+        for (size_t i = 0; i < prob_idx.size(); i++) {
          cum_prob += prob_idx[i].second;
          cutoff_len += 1;
          if (cum_prob >= cutoff_prob) break;
@@ -152,18 +144,18 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
       prob_idx = std::vector<std::pair<int, double>>(
           prob_idx.begin(), prob_idx.begin() + cutoff_len);
     }
-    std::vector<std::pair<int, float>> log_prob_idx;
-    for (int i = 0; i < cutoff_len; i++) {
+    std::vector<std::pair<size_t, float>> log_prob_idx;
+    for (size_t i = 0; i < cutoff_len; i++) {
       log_prob_idx.push_back(std::pair<int, float>(
           prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN)));
     }
 
     // loop over chars
-    for (int index = 0; index < log_prob_idx.size(); index++) {
+    for (size_t index = 0; index < log_prob_idx.size(); index++) {
       auto c = log_prob_idx[index].first;
       float log_prob_c = log_prob_idx[index].second;
 
-      for (int i = 0; i < prefixes.size() && i < beam_size; i++) {
+      for (size_t i = 0; i < prefixes.size() && i < beam_size; i++) {
         auto prefix = prefixes[i];
 
         if (full_beam && log_prob_c + prefix->score < min_cutoff) {
@@ -194,12 +186,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
         }
 
         // language model scoring
-        if (extscorer != nullptr &&
-            (c == space_id || extscorer->is_character_based())) {
+        if (ext_scorer != nullptr &&
+            (c == space_id || ext_scorer->is_character_based())) {
           PathTrie *prefix_toscore = nullptr;
 
           // skip scoring the space
-          if (extscorer->is_character_based()) {
+          if (ext_scorer->is_character_based()) {
             prefix_toscore = prefix_new;
           } else {
             prefix_toscore = prefix;
@@ -207,11 +199,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
           }
 
           double score = 0.0;
           std::vector<std::string> ngram;
-          ngram = extscorer->make_ngram(prefix_toscore);
-          score = extscorer->get_log_cond_prob(ngram) * extscorer->alpha;
+          ngram = ext_scorer->make_ngram(prefix_toscore);
+          score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
 
           log_p += score;
-          log_p += extscorer->beta;
+          log_p += ext_scorer->beta;
         }
         prefix_new->log_prob_nb_cur =
             log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
@@ -240,15 +232,15 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
   for (size_t i = 0; i < beam_size && i < prefixes.size(); i++) {
     double approx_ctc = prefixes[i]->score;
 
-    if (extscorer != nullptr) {
+    if (ext_scorer != nullptr) {
       std::vector<int> output;
       prefixes[i]->get_path_vec(output);
       size_t prefix_length = output.size();
-      auto words = extscorer->split_labels(output);
+      auto words = ext_scorer->split_labels(output);
       // remove word insert
-      approx_ctc = approx_ctc - prefix_length * extscorer->beta;
+      approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
       // remove language model weight:
-      approx_ctc -= (extscorer->get_sent_log_prob(words)) * extscorer->alpha;
+      approx_ctc -= (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
     }
 
     prefixes[i]->approx_ctc = approx_ctc;
@@ -269,7 +261,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     space_prefixes[i]->get_path_vec(output);
     // convert index to string
     std::string output_str;
-    for (int j = 0; j < output.size(); j++) {
+    for (size_t j = 0; j < output.size(); j++) {
       output_str += vocabulary[output[j]];
     }
     std::pair<double, std::string> output_pair(-space_prefixes[i]->approx_ctc,
@@ -283,49 +275,45 @@
 std::vector<std::vector<std::pair<double, std::string>>>
 ctc_beam_search_decoder_batch(
     const std::vector<std::vector<std::vector<double>>> &probs_split,
-    int beam_size,
+    const size_t beam_size,
     const std::vector<std::string> &vocabulary,
-    int blank_id,
-    int num_processes,
-    double cutoff_prob,
-    int cutoff_top_n,
-    Scorer *extscorer) {
-  if (num_processes <= 0) {
-    std::cout << "num_processes must be nonnegative!" << std::endl;
-    exit(1);
-  }
+    const size_t num_processes,
+    const double cutoff_prob,
+    const size_t cutoff_top_n,
+    Scorer *ext_scorer) {
+  VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
   // thread pool
   ThreadPool pool(num_processes);
   // number of samples
-  int batch_size = probs_split.size();
+  size_t batch_size = probs_split.size();
 
   // scorer filling up
-  if (extscorer != nullptr) {
-    if (extscorer->is_char_map_empty()) {
-      extscorer->set_char_map(vocabulary);
+  if (ext_scorer != nullptr) {
+    if (ext_scorer->is_char_map_empty()) {
+      ext_scorer->set_char_map(vocabulary);
     }
-    if (!extscorer->is_character_based() && extscorer->dictionary == nullptr) {
+    if (!ext_scorer->is_character_based() &&
+        ext_scorer->dictionary == nullptr) {
       // init dictionary
-      extscorer->fill_dictionary(true);
+      ext_scorer->fill_dictionary(true);
     }
   }
 
   // enqueue the tasks of decoding
   std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
-  for (int i = 0; i < batch_size; i++) {
+  for (size_t i = 0; i < batch_size; i++) {
     res.emplace_back(pool.enqueue(ctc_beam_search_decoder,
                                   probs_split[i],
                                   beam_size,
                                   vocabulary,
-                                  blank_id,
                                   cutoff_prob,
                                   cutoff_top_n,
-                                  extscorer));
+                                  ext_scorer));
   }
 
   // get decoding results
   std::vector<std::vector<std::pair<double, std::string>>> batch_results;
-  for (int i = 0; i < batch_size; i++) {
+  for (size_t i = 0; i < batch_size; i++) {
     batch_results.emplace_back(res[i].get());
   }
   return batch_results;
diff --git a/decoders/swig/ctc_decoders.h b/decoders/swig/ctc_decoders.h
index b8c512bd..6384c8a8 100644
--- a/decoders/swig/ctc_decoders.h
+++ b/decoders/swig/ctc_decoders.h
@@ -27,21 +27,21 @@ std::string ctc_greedy_decoder(
  *              over vocabulary of one time step.
  *     beam_size: The width of beam search.
  *     vocabulary: A vector of vocabulary.
- *     blank_id: ID of blank.
  *     cutoff_prob: Cutoff probability for pruning.
  *     cutoff_top_n: Cutoff number for pruning.
- *     ext_scorer: External scorer to evaluate a prefix.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
 * Return:
 *     A vector that each element is a pair of score and decoding result,
 *     in desending order.
*/
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
-    int beam_size,
+    const size_t beam_size,
     std::vector<std::string> vocabulary,
-    int blank_id,
-    double cutoff_prob = 1.0,
-    int cutoff_top_n = 40,
+    const double cutoff_prob = 1.0,
+    const size_t cutoff_top_n = 40,
     Scorer *ext_scorer = NULL);
 
 /* CTC Beam Search Decoder for batch data
@@ -52,11 +52,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
 * .
 *     beam_size: The width of beam search.
 *     vocabulary: A vector of vocabulary.
- *     blank_id: ID of blank.
 *     num_processes: Number of threads for beam search.
 *     cutoff_prob: Cutoff probability for pruning.
 *     cutoff_top_n: Cutoff number for pruning.
- *     ext_scorer: External scorer to evaluate a prefix.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
 * Return:
 *     A 2-D vector that each element is a vector of beam search decoding
 *     result for one audio sample.
@@ -64,12 +65,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
 std::vector<std::vector<std::pair<double, std::string>>>
 ctc_beam_search_decoder_batch(
     const std::vector<std::vector<std::vector<double>>> &probs_split,
-    int beam_size,
+    const size_t beam_size,
     const std::vector<std::string> &vocabulary,
-    int blank_id,
-    int num_processes,
+    const size_t num_processes,
     double cutoff_prob = 1.0,
-    int cutoff_top_n = 40,
+    const size_t cutoff_top_n = 40,
     Scorer *ext_scorer = NULL);
 
 #endif  // CTC_BEAM_SEARCH_DECODER_H_
diff --git a/decoders/swig/decoder_utils.h b/decoders/swig/decoder_utils.h
index d4ee36e1..015646dd 100644
--- a/decoders/swig/decoder_utils.h
+++ b/decoders/swig/decoder_utils.h
@@ -7,6 +7,22 @@
 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();
 
+// check if __A == _B
+#define VALID_CHECK_EQ(__A, __B, __ERR)          \
+  if ((__A) != (__B)) {                          \
+    std::ostringstream str;                      \
+    str << (__A) << " != " << (__B) << ", ";     \
+    throw std::runtime_error(str.str() + __ERR); \
+  }
+
+// check if __A > __B
+#define VALID_CHECK_GT(__A, __B, __ERR)          \
+  if ((__A) <= (__B)) {                          \
+    std::ostringstream str;                      \
+    str << (__A) << " <= " << (__B) << ", ";     \
+    throw std::runtime_error(str.str() + __ERR); \
+  }
+
 // Function template for comparing two pairs
 template <typename T1, typename T2>
 bool pair_comp_first_rev(const std::pair<T1, T2> &a,
diff --git a/decoders/swig_wrapper.py b/decoders/swig_wrapper.py
index 202440bf..54ed249f 100644
--- a/decoders/swig_wrapper.py
+++ b/decoders/swig_wrapper.py
@@ -41,7 +41,6 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
 def ctc_beam_search_decoder(probs_seq,
                             beam_size,
                             vocabulary,
-                            blank_id,
                             cutoff_prob=1.0,
                             cutoff_top_n=40,
                             ext_scoring_func=None):
@@ -55,8 +54,6 @@ def ctc_beam_search_decoder(probs_seq,
     :type beam_size: int
     :param vocabulary: Vocabulary list.
     :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
     :param cutoff_prob: Cutoff probability in pruning,
                         default 1.0, no pruning.
     :type cutoff_prob: float
@@ -72,15 +69,14 @@ def ctc_beam_search_decoder(probs_seq,
         results, in descending order of the probability.
     :rtype: list
     """
-    return swig_decoders.ctc_beam_search_decoder(
-        probs_seq.tolist(), beam_size, vocabulary, blank_id, cutoff_prob,
-        cutoff_top_n, ext_scoring_func)
+    return swig_decoders.ctc_beam_search_decoder(probs_seq.tolist(), beam_size,
+                                                 vocabulary, cutoff_prob,
+                                                 cutoff_top_n, ext_scoring_func)
 
 
 def ctc_beam_search_decoder_batch(probs_split,
                                   beam_size,
                                   vocabulary,
-                                  blank_id,
                                   num_processes,
                                   cutoff_prob=1.0,
                                   cutoff_top_n=40,
@@ -94,8 +90,6 @@ def ctc_beam_search_decoder_batch(probs_split,
     :type beam_size: int
     :param vocabulary: Vocabulary list.
     :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
     :param num_processes: Number of parallel processes.
     :type num_processes: int
     :param cutoff_prob: Cutoff probability in vocabulary pruning,
@@ -118,5 +112,5 @@ def ctc_beam_search_decoder_batch(probs_split,
     probs_split = [probs_seq.tolist() for probs_seq in probs_split]
 
     return swig_decoders.ctc_beam_search_decoder_batch(
-        probs_split, beam_size, vocabulary, blank_id, num_processes,
-        cutoff_prob, cutoff_top_n, ext_scoring_func)
+        probs_split, beam_size, vocabulary, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func)
diff --git a/examples/librispeech/run_test_golden.sh b/examples/librispeech/run_test_golden.sh
index 080c3c06..e539bd01 100644
--- a/examples/librispeech/run_test_golden.sh
+++ b/examples/librispeech/run_test_golden.sh
@@ -31,13 +31,13 @@ python -u test.py \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
---alpha=0.36 \
---beta=0.25 \
---cutoff_prob=0.99 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
---test_manifest='data/tiny/manifest.test-clean' \
+--test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='models/librispeech/mean_std.npz' \
 --vocab_path='models/librispeech/vocab.txt' \
 --model_path='models/librispeech/params.tar.gz' \
diff --git a/infer.py b/infer.py
index 48c4ef49..5da1db97 100644
--- a/infer.py
+++ b/infer.py
@@ -21,9 +21,9 @@ add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
-add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
-add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('alpha',            float,  2.15,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.35,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
 add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
@@ -85,7 +85,6 @@ def infer():
         pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)
 
-    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
         decoding_method=args.decoding_method,
@@ -93,7 +92,7 @@ def infer():
         beam_beta=args.beta,
         beam_size=args.beam_size,
         cutoff_prob=args.cutoff_prob,
-        vocab_list=vocab_list,
+        vocab_list=data_generator.vocab_list,
         language_model_path=args.lang_model_path,
         num_processes=args.num_proc_bsearch)
diff --git a/model_utils/model.py b/model_utils/model.py
index 5812afca..1a9910e9 100644
--- a/model_utils/model.py
+++ b/model_utils/model.py
@@ -214,7 +214,6 @@ class DeepSpeech2Model(object):
                 probs_split=probs_split,
                 vocabulary=vocab_list,
                 beam_size=beam_size,
-                blank_id=len(vocab_list),
                 num_processes=num_processes,
                 ext_scoring_func=self._ext_scorer,
                 cutoff_prob=cutoff_prob)
diff --git a/setup.sh b/setup.sh
index 6c8a7099..dcb3e0fb 100644
--- a/setup.sh
+++ b/setup.sh
@@ -26,4 +26,13 @@ if [ $? != 0 ]; then
     rm libsndfile-1.0.28.tar.gz
 fi
 
+# install decoders
+python -c "import swig_decoders"
+if [ $? != 0 ]; then
+    pushd decoders/swig > /dev/null
+    sh setup.sh
+    popd > /dev/null
+fi
+
+
 echo "Install all dependencies successfully."
diff --git a/test.py b/test.py
index 499f71f6..76efb4d1 100644
--- a/test.py
+++ b/test.py
@@ -22,9 +22,9 @@ add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
 add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
 add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
 add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
-add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
-add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('alpha',            float,  2.15,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.35,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
 add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
 add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
 add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
@@ -85,7 +85,6 @@ def evaluate():
         pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)
 
-    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     error_sum, num_ins = 0.0, 0
     for infer_data in batch_reader():
@@ -96,7 +95,7 @@ def evaluate():
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
-            vocab_list=vocab_list,
+            vocab_list=data_generator.vocab_list,
             language_model_path=args.lang_model_path,
             num_processes=args.num_proc_bsearch)
         target_transcripts = [
diff --git a/utils/utility.sh b/utils/utility.sh
index c8121126..aa0ec002 100644
--- a/utils/utility.sh
+++ b/utils/utility.sh
@@ -13,7 +13,7 @@ download() {
     wget -c $URL -P `dirname "$TARGET"`
 
     md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
-    if [ $MD5 -ne $md5_result ]; then
+    if [ ! $MD5 == $md5_result ]; then
         echo "Fail to download the language model!"
         return 1
     fi
-- 
GitLab
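After this patch, `blank_id` is no longer passed to the decoders; both the C++ implementation and the Python wrappers derive it as `vocabulary.size()`. The following is a minimal usage sketch of the updated `decoders/swig_wrapper.py` API, not part of the commit itself: it assumes the `swig_decoders` extension has already been built (which `setup.sh` now does automatically), that the repository root is on `PYTHONPATH` so the wrapper module is importable, and it uses made-up toy probabilities with no external scorer.

```python
# Hypothetical usage sketch of the post-change decoder API (no blank_id argument).
# The function names and parameters follow decoders/swig_wrapper.py from this patch;
# the import path, vocabulary, and probabilities below are illustrative assumptions.
import numpy as np

from decoders.swig_wrapper import ctc_greedy_decoder, ctc_beam_search_decoder_batch

vocab_list = ["'", ' ', 'a', 'b', 'c', 'd']   # toy vocabulary; the blank is implicit
num_time_steps = 6
num_classes = len(vocab_list) + 1             # vocabulary plus the blank at the last index

# Fake posteriors for two utterances; each row sums to 1 over vocabulary + blank.
probs_split = [
    np.random.dirichlet(np.ones(num_classes), size=num_time_steps)
    for _ in range(2)
]

# Greedy (best-path) decoding of a single utterance.
best_path = ctc_greedy_decoder(probs_split[0], vocab_list)

# Batch beam search decoding without an external LM scorer.
beam_results = ctc_beam_search_decoder_batch(
    probs_split=probs_split,
    beam_size=20,
    vocabulary=vocab_list,
    num_processes=2,
    cutoff_prob=1.0,
    cutoff_top_n=40,
    ext_scoring_func=None)

print(best_path)
print([candidates[0] for candidates in beam_results])  # top (score, text) pair per utterance
```

In model_utils/model.py the same call is made with a trained `Scorer` passed as `ext_scoring_func`, so the language-model weight `alpha` and word-insertion weight `beta` (retuned to 2.15 and 0.35 in this commit) only take effect when a scorer is supplied.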