Commit 3bed29dd authored by yangyaming

Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix-244

manifest*
mean_std.npz
thirdparty/
This diff is collapsed.
-#! /usr/bin/bash
+#! /usr/bin/env bash
TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
......
-#! /usr/bin/bash
+#! /usr/bin/env bash
TRAIN_MANIFEST=$1
DEV_MANIFEST=$2
@@ -15,6 +15,8 @@ python ./cloud/split_data.py \
--in_manifest_path=${DEV_MANIFEST} \
--out_manifest_path='/local.manifest.dev'
+mkdir ./logs
python -u train.py \
--batch_size=${BATCH_SIZE} \
--trainer_count=${NUM_GPU} \
@@ -35,10 +37,10 @@ python -u train.py \
--train_manifest='/local.manifest.train' \
--dev_manifest='/local.manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
---output_model_dir='./checkpoints' \
+--output_model_dir=${MODEL_PATH} \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped' \
-2>&1 | tee ./log/train.log
+2>&1 | tee ./logs/train.log
-#! /usr/bin/bash
+#! /usr/bin/env bash
mkdir cloud_manifests
......
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
@@ -19,8 +19,6 @@ import codecs
from paddle.v2.dataset.common import md5file
from data_utils.utility import download, unpack
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
-default=DATA_HOME + "/Libri",
+default='~/.cache/paddle/dataset/speech/libri',
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
@@ -60,8 +58,7 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path):
"""
Create a manifest json file summarizing the data set, with each line
"""Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
@@ -92,8 +89,7 @@ def create_manifest(data_dir, manifest_path):
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""
Download, unpack and create summmary manifest file.
"""Download, unpack and create summmary manifest file.
"""
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
@@ -108,6 +104,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
def main():
+args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN,
@@ -118,12 +116,12 @@ def main():
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
+prepare_dataset(
+url=URL_TRAIN_CLEAN_100,
+md5sum=MD5_TRAIN_CLEAN_100,
+target_dir=os.path.join(args.target_dir, "train-clean-100"),
+manifest_path=args.manifest_prefix + ".train-clean-100")
if args.full_download:
-prepare_dataset(
-url=URL_TRAIN_CLEAN_100,
-md5sum=MD5_TRAIN_CLEAN_100,
-target_dir=os.path.join(args.target_dir, "train-clean-100"),
-manifest_path=args.manifest_prefix + ".train-clean-100")
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
......
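For reference, each line that create_manifest() writes is a standalone JSON object carrying the audio filepath, duration, and transcription named in its docstring. A hypothetical manifest line (values invented for illustration) looks like:

{"audio_filepath": "/path/to/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac", "duration": 5.86, "text": "mister quilter is the apostle of the middle classes"}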
@@ -42,8 +42,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
def ctc_beam_search_decoder(probs_seq,
beam_size,
vocabulary,
-blank_id,
cutoff_prob=1.0,
+cutoff_top_n=40,
ext_scoring_func=None,
nproc=False):
"""CTC Beam search decoder.
@@ -66,8 +66,6 @@ def ctc_beam_search_decoder(probs_seq,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
-:param blank_id: ID of blank.
-:type blank_id: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
@@ -87,9 +85,8 @@
raise ValueError("The shape of prob_seq does not match with the "
"shape of the vocabulary.")
-# blank_id check
-if not blank_id < len(probs_seq[0]):
-raise ValueError("blank_id shouldn't be greater than probs dimension")
+# blank_id assign
+blank_id = len(vocabulary)
# If the decoder called in the multiprocesses, then use the global scorer
# instantiated in ctc_beam_search_decoder_batch().
@@ -114,7 +111,7 @@
prob_idx = list(enumerate(probs_seq[time_step]))
cutoff_len = len(prob_idx)
#If pruning is enabled
-if cutoff_prob < 1.0:
+if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
cutoff_len, cum_prob = 0, 0.0
for i in xrange(len(prob_idx)):
@@ -122,6 +119,7 @@
cutoff_len += 1
if cum_prob >= cutoff_prob:
break
+cutoff_len = min(cutoff_len, cutoff_top_n)
prob_idx = prob_idx[0:cutoff_len]
for l in prefix_set_prev:
@@ -180,6 +178,8 @@ def ctc_beam_search_decoder(probs_seq,
prob = prob * ext_scoring_func(result)
log_prob = log(prob)
beam_result.append((log_prob, result))
+else:
+beam_result.append((float('-inf'), ''))
## output top beam_size decoding results
beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
@@ -189,9 +189,9 @@
def ctc_beam_search_decoder_batch(probs_split,
beam_size,
vocabulary,
-blank_id,
num_processes,
cutoff_prob=1.0,
+cutoff_top_n=40,
ext_scoring_func=None):
"""CTC beam search decoder using multiple processes.
@@ -202,8 +202,6 @@ def ctc_beam_search_decoder_batch(probs_split,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
-:param blank_id: ID of blank.
-:type blank_id: int
:param num_processes: Number of parallel processes.
:type num_processes: int
:param cutoff_prob: Cutoff probability in pruning,
@@ -230,8 +228,8 @@ def ctc_beam_search_decoder_batch(probs_split,
pool = multiprocessing.Pool(processes=num_processes)
results = []
for i, probs_list in enumerate(probs_split):
-args = (probs_list, beam_size, vocabulary, blank_id, cutoff_prob, None,
-nproc)
+args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
+None, nproc)
results.append(pool.apply_async(ctc_beam_search_decoder, args))
pool.close()
......
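A minimal usage sketch of the updated Python decoder API above, with toy inputs invented for illustration; note that blank_id is no longer passed, since the decoder now assumes the blank occupies the last index, len(vocabulary):

probs_seq = [[0.1, 0.2, 0.7],  # P('a'), P('b'), P(blank) at t=0
             [0.6, 0.3, 0.1]]  # t=1
vocabulary = ['a', 'b']        # blank is implicitly index 2
beam_result = ctc_beam_search_decoder(
    probs_seq=probs_seq,
    beam_size=4,
    vocabulary=vocabulary,
    cutoff_prob=1.0,   # 1.0 disables probability-mass pruning
    cutoff_top_n=40)   # keep at most 40 characters per time step
print(beam_result[0])  # (log probability, decoded string) of the best beam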
@@ -8,7 +8,7 @@ import kenlm
import numpy as np
-class LmScorer(object):
+class Scorer(object):
"""External scorer to evaluate a prefix or whole sentence in
beam search decoding, including the score from n-gram language
model and word count.
......
#include "ctc_beam_search_decoder.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <utility>
#include "ThreadPool.h"
#include "fst/fstlib.h"
#include "decoder_utils.h"
#include "path_trie.h"
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// assign blank id
size_t blank_id = vocabulary.size();
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// init prefixes' root
PathTrie root;
root.score = root.log_prob_b_prev = 0.0;
std::vector<PathTrie *> prefixes;
prefixes.push_back(&root);
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
auto fst_dict = static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
fst::StdVectorFst *dict_ptr = fst_dict->Copy(true);
root.set_dictionary(dict_ptr);
auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
root.set_matcher(matcher);
}
// prefix search over time
for (size_t time_step = 0; time_step < num_time_steps; ++time_step) {
auto &prob = probs_seq[time_step];
float min_cutoff = -NUM_FLT_INF;
bool full_beam = false;
if (ext_scorer != nullptr) {
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(
prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
min_cutoff = prefixes[num_prefixes - 1]->score +
std::log(prob[blank_id]) - std::max(0.0, ext_scorer->beta);
full_beam = (num_prefixes == beam_size);
}
std::vector<std::pair<size_t, float>> log_prob_idx =
get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n);
// loop over chars
for (size_t index = 0; index < log_prob_idx.size(); index++) {
auto c = log_prob_idx[index].first;
auto log_prob_c = log_prob_idx[index].second;
for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) {
auto prefix = prefixes[i];
if (full_beam && log_prob_c + prefix->score < min_cutoff) {
break;
}
// blank
if (c == blank_id) {
prefix->log_prob_b_cur =
log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score);
continue;
}
// repeated character
if (c == prefix->character) {
prefix->log_prob_nb_cur = log_sum_exp(
prefix->log_prob_nb_cur, log_prob_c + prefix->log_prob_nb_prev);
}
// get new prefix
auto prefix_new = prefix->get_path_trie(c);
if (prefix_new != nullptr) {
float log_p = -NUM_FLT_INF;
if (c == prefix->character &&
prefix->log_prob_b_prev > -NUM_FLT_INF) {
log_p = log_prob_c + prefix->log_prob_b_prev;
} else if (c != prefix->character) {
log_p = log_prob_c + prefix->score;
}
// language model scoring
if (ext_scorer != nullptr &&
(c == space_id || ext_scorer->is_character_based())) {
PathTrie *prefix_toscore = nullptr;
// skip scoring the space
if (ext_scorer->is_character_based()) {
prefix_toscore = prefix_new;
} else {
prefix_toscore = prefix;
}
double score = 0.0;
std::vector<std::string> ngram;
ngram = ext_scorer->make_ngram(prefix_toscore);
score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
log_p += score;
log_p += ext_scorer->beta;
}
prefix_new->log_prob_nb_cur =
log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
}
} // end of loop over prefix
} // end of loop over vocabulary
prefixes.clear();
// update log probs
root.iterate_to_vec(prefixes);
// only preserve top beam_size prefixes
if (prefixes.size() >= beam_size) {
std::nth_element(prefixes.begin(),
prefixes.begin() + beam_size,
prefixes.end(),
prefix_compare);
for (size_t i = beam_size; i < prefixes.size(); ++i) {
prefixes[i]->remove();
}
}
} // end of loop over time
// compute approximate ctc score as the return score, without affecting the
// return order of decoding result. To be removed once the decoder is stable.
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
double approx_ctc = prefixes[i]->score;
if (ext_scorer != nullptr) {
std::vector<int> output;
prefixes[i]->get_path_vec(output);
auto prefix_length = output.size();
auto words = ext_scorer->split_labels(output);
// remove word insert
approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
// remove language model weight:
approx_ctc -= (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
}
prefixes[i]->approx_ctc = approx_ctc;
}
return get_beam_search_result(prefixes, vocabulary, beam_size);
}
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoder_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &vocabulary,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be positive!");
// thread pool
ThreadPool pool(num_processes);
// number of samples
size_t batch_size = probs_split.size();
// enqueue the tasks of decoding
std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
for (size_t i = 0; i < batch_size; ++i) {
res.emplace_back(pool.enqueue(ctc_beam_search_decoder,
probs_split[i],
vocabulary,
beam_size,
cutoff_prob,
cutoff_top_n,
ext_scorer));
}
// get decoding results
std::vector<std::vector<std::pair<double, std::string>>> batch_results;
for (size_t i = 0; i < batch_size; ++i) {
batch_results.emplace_back(res[i].get());
}
return batch_results;
}
#ifndef CTC_BEAM_SEARCH_DECODER_H_
#define CTC_BEAM_SEARCH_DECODER_H_
#include <string>
#include <utility>
#include <vector>
#include "scorer.h"
/* CTC Beam Search Decoder
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* beam_size: The width of beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A vector that each element is a pair of score and decoding result,
* in descending order.
*/
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
/* CTC Beam Search Decoder for batch data
* Parameters:
* probs_seq: 3-D vector that each element is a 2-D vector that can be used
* by ctc_beam_search_decoder().
* vocabulary: A vector of vocabulary.
* beam_size: The width of beam search.
* num_processes: Number of threads for beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A 2-D vector that each element is a vector of beam search decoding
* result for one audio sample.
*/
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoder_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &vocabulary,
size_t beam_size,
size_t num_processes,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
#endif // CTC_BEAM_SEARCH_DECODER_H_
#include "ctc_greedy_decoder.h"
#include "decoder_utils.h"
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec;
for (size_t i = 0; i < num_time_steps; ++i) {
double max_prob = 0.0;
size_t max_idx = 0;
const std::vector<double> &probs_step = probs_seq[i];
for (size_t j = 0; j < probs_step.size(); ++j) {
if (max_prob < probs_step[j]) {
max_idx = j;
max_prob = probs_step[j];
}
}
// id with maximum probability in current time step
max_idx_vec[i] = max_idx;
// deduplicate
if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) {
idx_vec.push_back(max_idx_vec[i]);
}
}
std::string best_path_result;
for (size_t i = 0; i < idx_vec.size(); ++i) {
if (idx_vec[i] != blank_id) {
best_path_result += vocabulary[idx_vec[i]];
}
}
return best_path_result;
}
#ifndef CTC_GREEDY_DECODER_H
#define CTC_GREEDY_DECODER_H
#include <string>
#include <vector>
/* CTC Greedy (Best Path) Decoder
*
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* Return:
* The decoding result in string
*/
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary);
#endif // CTC_GREEDY_DECODER_H
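The C++ greedy decoder above reduces to three steps: per-time-step argmax, collapsing consecutive repeats, and dropping blanks. An equivalent Python sketch (illustrative only, not part of this commit):

def greedy_decode(probs_seq, vocabulary):
    """Best-path decoding: argmax per step, collapse repeats, drop blanks."""
    blank_id = len(vocabulary)  # blank occupies the last index
    result, prev = [], None
    for step in probs_seq:
        idx = max(range(len(step)), key=step.__getitem__)  # argmax
        if idx != prev and idx != blank_id:
            result.append(vocabulary[idx])
        prev = idx
    return ''.join(result)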
#include "decoder_utils.h"
#include <algorithm>
#include <cmath>
#include <limits>
std::vector<std::pair<size_t, float>> get_pruned_log_probs(
const std::vector<double> &prob_step,
double cutoff_prob,
size_t cutoff_top_n) {
std::vector<std::pair<int, double>> prob_idx;
for (size_t i = 0; i < prob_step.size(); ++i) {
prob_idx.push_back(std::pair<int, double>(i, prob_step[i]));
}
// pruning of vocabulary
size_t cutoff_len = prob_step.size();
if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) {
std::sort(
prob_idx.begin(), prob_idx.end(), pair_comp_second_rev<int, double>);
if (cutoff_prob < 1.0) {
double cum_prob = 0.0;
cutoff_len = 0;
for (size_t i = 0; i < prob_idx.size(); ++i) {
cum_prob += prob_idx[i].second;
cutoff_len += 1;
if (cum_prob >= cutoff_prob || cutoff_len >= cutoff_top_n) break;
}
}
// cap at cutoff_top_n even when cutoff_prob pruning is disabled
cutoff_len = std::min(cutoff_len, cutoff_top_n);
prob_idx = std::vector<std::pair<int, double>>(
prob_idx.begin(), prob_idx.begin() + cutoff_len);
}
std::vector<std::pair<size_t, float>> log_prob_idx;
for (size_t i = 0; i < cutoff_len; ++i) {
log_prob_idx.push_back(std::pair<int, float>(
prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN)));
}
return log_prob_idx;
}
std::vector<std::pair<double, std::string>> get_beam_search_result(
const std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size) {
// allow for the post processing
std::vector<PathTrie *> space_prefixes;
if (space_prefixes.empty()) {
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
space_prefixes.push_back(prefixes[i]);
}
}
std::sort(space_prefixes.begin(), space_prefixes.end(), prefix_compare);
std::vector<std::pair<double, std::string>> output_vecs;
for (size_t i = 0; i < beam_size && i < space_prefixes.size(); ++i) {
std::vector<int> output;
space_prefixes[i]->get_path_vec(output);
// convert index to string
std::string output_str;
for (size_t j = 0; j < output.size(); j++) {
output_str += vocabulary[output[j]];
}
std::pair<double, std::string> output_pair(-space_prefixes[i]->approx_ctc,
output_str);
output_vecs.emplace_back(output_pair);
}
return output_vecs;
}
size_t get_utf8_str_len(const std::string &str) {
size_t str_len = 0;
for (char c : str) {
str_len += ((c & 0xc0) != 0x80);
}
return str_len;
}
std::vector<std::string> split_utf8_str(const std::string &str) {
std::vector<std::string> result;
std::string out_str;
for (char c : str) {
if ((c & 0xc0) != 0x80) // new UTF-8 character
{
if (!out_str.empty()) {
result.push_back(out_str);
out_str.clear();
}
}
out_str.append(1, c);
}
result.push_back(out_str);
return result;
}
std::vector<std::string> split_str(const std::string &s,
const std::string &delim) {
std::vector<std::string> result;
std::size_t start = 0, delim_len = delim.size();
while (true) {
std::size_t end = s.find(delim, start);
if (end == std::string::npos) {
if (start < s.size()) {
result.push_back(s.substr(start));
}
break;
}
if (end > start) {
result.push_back(s.substr(start, end - start));
}
start = end + delim_len;
}
return result;
}
bool prefix_compare(const PathTrie *x, const PathTrie *y) {
if (x->score == y->score) {
if (x->character == y->character) {
return false;
} else {
return (x->character < y->character);
}
} else {
return x->score > y->score;
}
}
void add_word_to_fst(const std::vector<int> &word,
fst::StdVectorFst *dictionary) {
if (dictionary->NumStates() == 0) {
fst::StdVectorFst::StateId start = dictionary->AddState();
assert(start == 0);
dictionary->SetStart(start);
}
fst::StdVectorFst::StateId src = dictionary->Start();
fst::StdVectorFst::StateId dst;
for (auto c : word) {
dst = dictionary->AddState();
dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
src = dst;
}
dictionary->SetFinal(dst, fst::StdArc::Weight::One());
}
bool add_word_to_dictionary(
const std::string &word,
const std::unordered_map<std::string, int> &char_map,
bool add_space,
int SPACE_ID,
fst::StdVectorFst *dictionary) {
auto characters = split_utf8_str(word);
std::vector<int> int_word;
for (auto &c : characters) {
if (c == " ") {
int_word.push_back(SPACE_ID);
} else {
auto int_c = char_map.find(c);
if (int_c != char_map.end()) {
int_word.push_back(int_c->second);
} else {
return false; // return without adding
}
}
}
if (add_space) {
int_word.push_back(SPACE_ID);
}
add_word_to_fst(int_word, dictionary);
return true; // return with successful adding
}
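get_pruned_log_probs() above combines two pruning criteria: keep the smallest set of top characters whose cumulative probability reaches cutoff_prob, and never keep more than cutoff_top_n of them. A Python sketch of the same logic (illustrative; the epsilon stands in for NUM_FLT_MIN to guard log(0)):

import math

def pruned_log_probs(prob_step, cutoff_prob=1.0, cutoff_top_n=40):
    prob_idx = list(enumerate(prob_step))
    cutoff_len = len(prob_idx)
    if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
        # sort candidate characters by probability, descending
        prob_idx.sort(key=lambda p: p[1], reverse=True)
        if cutoff_prob < 1.0:
            cum_prob, cutoff_len = 0.0, 0
            for _, p in prob_idx:
                cum_prob += p
                cutoff_len += 1
                if cum_prob >= cutoff_prob or cutoff_len >= cutoff_top_n:
                    break
        cutoff_len = min(cutoff_len, cutoff_top_n)
        prob_idx = prob_idx[:cutoff_len]
    return [(i, math.log(p + 1.2e-38)) for i, p in prob_idx]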
#ifndef DECODER_UTILS_H_
#define DECODER_UTILS_H_
#include <utility>
#include "fst/log.h"
#include "path_trie.h"
const float NUM_FLT_INF = std::numeric_limits<float>::max();
const float NUM_FLT_MIN = std::numeric_limits<float>::min();
// inline function for validation check
inline void check(
bool x, const char *expr, const char *file, int line, const char *err) {
if (!x) {
std::cout << "[" << file << ":" << line << "] ";
LOG(FATAL) << "\"" << expr << "\" check failed. " << err;
}
}
#define VALID_CHECK(x, info) \
check(static_cast<bool>(x), #x, __FILE__, __LINE__, info)
#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info)
#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info)
#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info)
// Function template for comparing two pairs by first element, descending
template <typename T1, typename T2>
bool pair_comp_first_rev(const std::pair<T1, T2> &a,
const std::pair<T1, T2> &b) {
return a.first > b.first;
}
// Function template for comparing two pairs by second element, descending
template <typename T1, typename T2>
bool pair_comp_second_rev(const std::pair<T1, T2> &a,
const std::pair<T1, T2> &b) {
return a.second > b.second;
}
// Return the sum of two probabilities in log scale
template <typename T>
T log_sum_exp(const T &x, const T &y) {
static T num_min = -std::numeric_limits<T>::max();
if (x <= num_min) return y;
if (y <= num_min) return x;
T xmax = std::max(x, y);
return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax;
}
// Get pruned probability vector for each time step's beam search
std::vector<std::pair<size_t, float>> get_pruned_log_probs(
const std::vector<double> &prob_step,
double cutoff_prob,
size_t cutoff_top_n);
// Get beam search result from prefixes in trie tree
std::vector<std::pair<double, std::string>> get_beam_search_result(
const std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size);
// Functor for prefix comparison
bool prefix_compare(const PathTrie *x, const PathTrie *y);
/* Get the length of a UTF-8 encoded string
* See: http://stackoverflow.com/a/4063229
*/
size_t get_utf8_str_len(const std::string &str);
/* Split a string into a list of strings on a given string
* delimiter. NB: delimiters on beginning / end of string are
* trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"].
*/
std::vector<std::string> split_str(const std::string &s,
const std::string &delim);
/* Split a string into a vector of strings representing
* UTF-8 characters (not the same as bytes)
*/
std::vector<std::string> split_utf8_str(const std::string &str);
// Add a word, given as a vector of character indices, to the FST dictionary
void add_word_to_fst(const std::vector<int> &word,
fst::StdVectorFst *dictionary);
// Add a word in string to dictionary
bool add_word_to_dictionary(
const std::string &word,
const std::unordered_map<std::string, int> &char_map,
bool add_space,
int SPACE_ID,
fst::StdVectorFst *dictionary);
#endif  // DECODER_UTILS_H_
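log_sum_exp() above uses the standard max-factoring trick: pulling exp(xmax) out keeps both remaining exponents at or below zero, so neither call can overflow. The same computation in Python (illustrative; -inf plays the role of the C++ num_min sentinel):

import math

def log_sum_exp(x, y):
    if x == float('-inf'):
        return y
    if y == float('-inf'):
        return x
    xmax = max(x, y)
    # exp(x - xmax) and exp(y - xmax) are both <= 1, so no overflow
    return math.log(math.exp(x - xmax) + math.exp(y - xmax)) + xmax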
%module swig_decoders
%{
#include "scorer.h"
#include "ctc_greedy_decoder.h"
#include "ctc_beam_search_decoder.h"
#include "decoder_utils.h"
%}
%include "std_vector.i"
%include "std_pair.i"
%include "std_string.i"
%import "decoder_utils.h"
namespace std {
%template(DoubleVector) std::vector<double>;
%template(IntVector) std::vector<int>;
%template(StringVector) std::vector<std::string>;
%template(VectorOfStructVector) std::vector<std::vector<double> >;
%template(FloatVector) std::vector<float>;
%template(Pair) std::pair<float, std::string>;
%template(PairFloatStringVector) std::vector<std::pair<float, std::string> >;
%template(PairDoubleStringVector) std::vector<std::pair<double, std::string> >;
%template(PairDoubleStringVector2) std::vector<std::vector<std::pair<double, std::string> > >;
%template(DoubleVector3) std::vector<std::vector<std::vector<double> > >;
}
%template(IntDoublePairCompSecondRev) pair_comp_second_rev<int, double>;
%template(StringDoublePairCompSecondRev) pair_comp_second_rev<std::string, double>;
%template(DoubleStringPairCompFirstRev) pair_comp_first_rev<double, std::string>;
%include "scorer.h"
%include "ctc_greedy_decoder.h"
%include "ctc_beam_search_decoder.h"
#include "path_trie.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "decoder_utils.h"
PathTrie::PathTrie() {
log_prob_b_prev = -NUM_FLT_INF;
log_prob_nb_prev = -NUM_FLT_INF;
log_prob_b_cur = -NUM_FLT_INF;
log_prob_nb_cur = -NUM_FLT_INF;
score = -NUM_FLT_INF;
ROOT_ = -1;
character = ROOT_;
exists_ = true;
parent = nullptr;
dictionary_ = nullptr;
dictionary_state_ = 0;
has_dictionary_ = false;
matcher_ = nullptr;
}
PathTrie::~PathTrie() {
for (auto child : children_) {
delete child.second;
}
}
PathTrie* PathTrie::get_path_trie(int new_char, bool reset) {
auto child = children_.begin();
for (child = children_.begin(); child != children_.end(); ++child) {
if (child->first == new_char) {
break;
}
}
if (child != children_.end()) {
if (!child->second->exists_) {
child->second->exists_ = true;
child->second->log_prob_b_prev = -NUM_FLT_INF;
child->second->log_prob_nb_prev = -NUM_FLT_INF;
child->second->log_prob_b_cur = -NUM_FLT_INF;
child->second->log_prob_nb_cur = -NUM_FLT_INF;
}
return (child->second);
} else {
if (has_dictionary_) {
matcher_->SetState(dictionary_state_);
bool found = matcher_->Find(new_char);
if (!found) {
// Adding this character would take the prefix outside the dictionary
auto FSTZERO = fst::TropicalWeight::Zero();
auto final_weight = dictionary_->Final(dictionary_state_);
bool is_final = (final_weight != FSTZERO);
if (is_final && reset) {
dictionary_state_ = dictionary_->Start();
}
return nullptr;
} else {
PathTrie* new_path = new PathTrie;
new_path->character = new_char;
new_path->parent = this;
new_path->dictionary_ = dictionary_;
new_path->dictionary_state_ = matcher_->Value().nextstate;
new_path->has_dictionary_ = true;
new_path->matcher_ = matcher_;
children_.push_back(std::make_pair(new_char, new_path));
return new_path;
}
} else {
PathTrie* new_path = new PathTrie;
new_path->character = new_char;
new_path->parent = this;
children_.push_back(std::make_pair(new_char, new_path));
return new_path;
}
}
}
PathTrie* PathTrie::get_path_vec(std::vector<int>& output) {
return get_path_vec(output, ROOT_);
}
PathTrie* PathTrie::get_path_vec(std::vector<int>& output,
int stop,
size_t max_steps) {
if (character == stop || character == ROOT_ || output.size() == max_steps) {
std::reverse(output.begin(), output.end());
return this;
} else {
output.push_back(character);
return parent->get_path_vec(output, stop, max_steps);
}
}
void PathTrie::iterate_to_vec(std::vector<PathTrie*>& output) {
if (exists_) {
log_prob_b_prev = log_prob_b_cur;
log_prob_nb_prev = log_prob_nb_cur;
log_prob_b_cur = -NUM_FLT_INF;
log_prob_nb_cur = -NUM_FLT_INF;
score = log_sum_exp(log_prob_b_prev, log_prob_nb_prev);
output.push_back(this);
}
for (auto child : children_) {
child.second->iterate_to_vec(output);
}
}
void PathTrie::remove() {
exists_ = false;
if (children_.size() == 0) {
auto child = parent->children_.begin();
for (child = parent->children_.begin(); child != parent->children_.end();
++child) {
if (child->first == character) {
parent->children_.erase(child);
break;
}
}
if (parent->children_.size() == 0 && !parent->exists_) {
parent->remove();
}
delete this;
}
}
void PathTrie::set_dictionary(fst::StdVectorFst* dictionary) {
dictionary_ = dictionary;
dictionary_state_ = dictionary->Start();
has_dictionary_ = true;
}
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
void PathTrie::set_matcher(std::shared_ptr<FSTMATCH> matcher) {
matcher_ = matcher;
}
#ifndef PATH_TRIE_H
#define PATH_TRIE_H
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "fst/fstlib.h"
/* Trie for storing and manipulating prefixes, with a dictionary stored in a
* finite-state transducer for spelling correction.
*/
class PathTrie {
public:
PathTrie();
~PathTrie();
// get new prefix after appending new char
PathTrie* get_path_trie(int new_char, bool reset = true);
// get the prefix in index from root to current node
PathTrie* get_path_vec(std::vector<int>& output);
// get the prefix in index from some stop node to the current node
PathTrie* get_path_vec(std::vector<int>& output,
int stop,
size_t max_steps = std::numeric_limits<size_t>::max());
// update log probs
void iterate_to_vec(std::vector<PathTrie*>& output);
// set dictionary for FST
void set_dictionary(fst::StdVectorFst* dictionary);
void set_matcher(std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>>);
bool is_empty() { return ROOT_ == character; }
// remove current path from root
void remove();
float log_prob_b_prev;
float log_prob_nb_prev;
float log_prob_b_cur;
float log_prob_nb_cur;
float score;
float approx_ctc;
int character;
PathTrie* parent;
private:
int ROOT_;
bool exists_;
bool has_dictionary_;
std::vector<std::pair<int, PathTrie*>> children_;
// pointer to dictionary of FST
fst::StdVectorFst* dictionary_;
fst::StdVectorFst::StateId dictionary_state_;
// matcher for finding arcs in the FST
std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>> matcher_;
};
#endif // PATH_TRIE_H
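PathTrie exists to hold, for every live prefix, the blank-terminated and non-blank-terminated path probabilities that CTC prefix beam search updates at each time step. A dictionary-based Python sketch of one update step (illustrative; the trie plus iterate_to_vec() implement the same recurrence without materializing prefix tuples, and log_sum_exp is the helper sketched earlier):

from collections import defaultdict

NEG_INF = float('-inf')

def prefix_beam_step(beams, log_probs, blank_id):
    """beams: prefix tuple -> (log P(..ends in blank), log P(..non-blank))."""
    nxt = defaultdict(lambda: (NEG_INF, NEG_INF))
    for prefix, (p_b, p_nb) in beams.items():
        score = log_sum_exp(p_b, p_nb)
        for c, log_p in enumerate(log_probs):
            if c == blank_id:
                b, nb = nxt[prefix]  # a blank keeps the prefix unchanged
                nxt[prefix] = (log_sum_exp(b, log_p + score), nb)
            elif prefix and c == prefix[-1]:
                b, nb = nxt[prefix]  # repeated char merges into same prefix
                nxt[prefix] = (b, log_sum_exp(nb, log_p + p_nb))
                b, nb = nxt[prefix + (c,)]  # unless a blank separated it
                nxt[prefix + (c,)] = (b, log_sum_exp(nb, log_p + p_b))
            else:
                b, nb = nxt[prefix + (c,)]
                nxt[prefix + (c,)] = (b, log_sum_exp(nb, log_p + score))
    return nxt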
#include "scorer.h"
#include <unistd.h>
#include <iostream>
#include "lm/config.hh"
#include "lm/model.hh"
#include "lm/state.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "decoder_utils.h"
using namespace lm::ngram;
Scorer::Scorer(double alpha,
double beta,
const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
this->alpha = alpha;
this->beta = beta;
dictionary = nullptr;
is_character_based_ = true;
language_model_ = nullptr;
max_order_ = 0;
dict_size_ = 0;
SPACE_ID_ = -1;
setup(lm_path, vocab_list);
}
Scorer::~Scorer() {
if (language_model_ != nullptr) {
delete static_cast<lm::base::Model*>(language_model_);
}
if (dictionary != nullptr) {
delete static_cast<fst::StdVectorFst*>(dictionary);
}
}
void Scorer::setup(const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
// load language model
load_lm(lm_path);
// set char map for scorer
set_char_map(vocab_list);
// fill the dictionary for FST
if (!is_character_based()) {
fill_dictionary(true);
}
}
void Scorer::load_lm(const std::string& lm_path) {
const char* filename = lm_path.c_str();
VALID_CHECK_EQ(access(filename, F_OK), 0, "Invalid language model path");
RetriveStrEnumerateVocab enumerate;
lm::ngram::Config config;
config.enumerate_vocab = &enumerate;
language_model_ = lm::ngram::LoadVirtual(filename, config);
max_order_ = static_cast<lm::base::Model*>(language_model_)->Order();
vocabulary_ = enumerate.vocabulary;
for (size_t i = 0; i < vocabulary_.size(); ++i) {
if (is_character_based_ && vocabulary_[i] != UNK_TOKEN &&
vocabulary_[i] != START_TOKEN && vocabulary_[i] != END_TOKEN &&
get_utf8_str_len(enumerate.vocabulary[i]) > 1) {
is_character_based_ = false;
}
}
}
double Scorer::get_log_cond_prob(const std::vector<std::string>& words) {
lm::base::Model* model = static_cast<lm::base::Model*>(language_model_);
double cond_prob;
lm::ngram::State state, tmp_state, out_state;
// avoid inserting <s> at the beginning
model->NullContextWrite(&state);
for (size_t i = 0; i < words.size(); ++i) {
lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]);
// encounter OOV
if (word_index == 0) {
return OOV_SCORE;
}
cond_prob = model->BaseScore(&state, word_index, &out_state);
tmp_state = state;
state = out_state;
out_state = tmp_state;
}
// return log10 prob
return cond_prob;
}
double Scorer::get_sent_log_prob(const std::vector<std::string>& words) {
std::vector<std::string> sentence;
if (words.size() == 0) {
for (size_t i = 0; i < max_order_; ++i) {
sentence.push_back(START_TOKEN);
}
} else {
for (size_t i = 0; i < max_order_ - 1; ++i) {
sentence.push_back(START_TOKEN);
}
sentence.insert(sentence.end(), words.begin(), words.end());
}
sentence.push_back(END_TOKEN);
return get_log_prob(sentence);
}
double Scorer::get_log_prob(const std::vector<std::string>& words) {
assert(words.size() > max_order_);
double score = 0.0;
for (size_t i = 0; i < words.size() - max_order_ + 1; ++i) {
std::vector<std::string> ngram(words.begin() + i,
words.begin() + i + max_order_);
score += get_log_cond_prob(ngram);
}
return score;
}
void Scorer::reset_params(float alpha, float beta) {
this->alpha = alpha;
this->beta = beta;
}
std::string Scorer::vec2str(const std::vector<int>& input) {
std::string word;
for (auto ind : input) {
word += char_list_[ind];
}
return word;
}
std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
if (labels.empty()) return {};
std::string s = vec2str(labels);
std::vector<std::string> words;
if (is_character_based_) {
words = split_utf8_str(s);
} else {
words = split_str(s, " ");
}
return words;
}
void Scorer::set_char_map(const std::vector<std::string>& char_list) {
char_list_ = char_list;
char_map_.clear();
for (size_t i = 0; i < char_list_.size(); i++) {
if (char_list_[i] == " ") {
SPACE_ID_ = i;
char_map_[' '] = i;
} else if (char_list_[i].size() == 1) {
char_map_[char_list_[i][0]] = i;
}
}
}
std::vector<std::string> Scorer::make_ngram(PathTrie* prefix) {
std::vector<std::string> ngram;
PathTrie* current_node = prefix;
PathTrie* new_node = nullptr;
for (int order = 0; order < max_order_; order++) {
std::vector<int> prefix_vec;
if (is_character_based_) {
new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_, 1);
current_node = new_node;
} else {
new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_);
current_node = new_node->parent; // Skipping spaces
}
// reconstruct word
std::string word = vec2str(prefix_vec);
ngram.push_back(word);
if (new_node->character == -1) {
// No more spaces, but still need order
for (int i = 0; i < max_order_ - order - 1; i++) {
ngram.push_back(START_TOKEN);
}
break;
}
}
std::reverse(ngram.begin(), ngram.end());
return ngram;
}
void Scorer::fill_dictionary(bool add_space) {
fst::StdVectorFst dictionary;
// First invert char_list so indices can be looked up by character
std::unordered_map<std::string, int> char_map;
for (size_t i = 0; i < char_list_.size(); i++) {
char_map[char_list_[i]] = i;
}
// For each unigram convert to ints and put in trie
int dict_size = 0;
for (const auto& word : vocabulary_) {
bool added = add_word_to_dictionary(
word, char_map, add_space, SPACE_ID_, &dictionary);
dict_size += added ? 1 : 0;
}
dict_size_ = dict_size;
/* Simplify FST
* This gets rid of "epsilon" transitions in the FST.
* These are transitions that don't require a string input to be taken.
* Getting rid of them is necessary to make the FST deterministic, but
* can greatly increase the size of the FST
*/
fst::RmEpsilon(&dictionary);
fst::StdVectorFst* new_dict = new fst::StdVectorFst;
/* This makes the FST deterministic, meaning for any string input there's
* only one possible state the FST could be in. It is assumed our
* dictionary is deterministic when using it.
* (lest we'd have to check for multiple transitions at each state)
*/
fst::Determinize(dictionary, new_dict);
/* Finds the simplest equivalent fst. This is unnecessary but decreases
* memory usage of the dictionary
*/
fst::Minimize(new_dict);
this->dictionary = new_dict;
}
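Scorer::get_log_cond_prob() returns KenLM's native log10 probabilities, which is why the alpha-weighted LM scores combine additively with the decoder's log-domain scores. For intuition, the same queries can be made from Python with the kenlm package (assumed installed; the model path is the hypothetical result of the download scripts further below):

import kenlm

model = kenlm.Model('common_crawl_00.prune01111.trie.klm')
print(model.order)  # n-gram order, what Scorer::max_order_ holds
# full-sentence log10 probability, analogous to get_sent_log_prob()
print(model.score('the quick brown fox', bos=True, eos=True))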
#ifndef SCORER_H_
#define SCORER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"
#include "path_trie.h"
const double OOV_SCORE = -1000.0;
const std::string START_TOKEN = "<s>";
const std::string UNK_TOKEN = "<unk>";
const std::string END_TOKEN = "</s>";
// Implements a callback to retrieve the vocabulary of the language model.
class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
public:
RetriveStrEnumerateVocab() {}
void Add(lm::WordIndex index, const StringPiece &str) {
vocabulary.push_back(std::string(str.data(), str.length()));
}
std::vector<std::string> vocabulary;
};
/* External scorer to query score for n-gram or sentence, including language
* model scoring and word insertion.
*
* Example:
* Scorer scorer(alpha, beta, "path_of_language_model");
* scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
* scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
*/
class Scorer {
public:
Scorer(double alpha,
double beta,
const std::string &lm_path,
const std::vector<std::string> &vocabulary);
~Scorer();
double get_log_cond_prob(const std::vector<std::string> &words);
double get_sent_log_prob(const std::vector<std::string> &words);
// return the max order
size_t get_max_order() const { return max_order_; }
// return the dictionary size of language model
size_t get_dict_size() const { return dict_size_; }
// return true if the language model is character based
bool is_character_based() const { return is_character_based_; }
// reset params alpha & beta
void reset_params(float alpha, float beta);
// make ngram for a given prefix
std::vector<std::string> make_ngram(PathTrie *prefix);
// transform the labels in index form into a vector of words (word-based lm)
// or a vector of characters (character-based lm)
std::vector<std::string> split_labels(const std::vector<int> &labels);
// language model weight
double alpha;
// word insertion weight
double beta;
// pointer to the dictionary of FST
void *dictionary;
protected:
// necessary setup: load language model, set char map, fill FST's dictionary
void setup(const std::string &lm_path,
const std::vector<std::string> &vocab_list);
// load language model from given path
void load_lm(const std::string &lm_path);
// fill dictionary for FST
void fill_dictionary(bool add_space);
// set char map
void set_char_map(const std::vector<std::string> &char_list);
double get_log_prob(const std::vector<std::string> &words);
// translate the vector in index to string
std::string vec2str(const std::vector<int> &input);
private:
void *language_model_;
bool is_character_based_;
size_t max_order_;
size_t dict_size_;
int SPACE_ID_;
std::vector<std::string> char_list_;
std::unordered_map<char, int> char_map_;
std::vector<std::string> vocabulary_;
};
#endif // SCORER_H_
"""Script to build and install decoder package."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from setuptools import setup, Extension, distutils
import glob
import platform
import os, sys
import multiprocessing.pool
import argparse
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_processes",
default=1,
type=int,
help="Number of cpu processes to build package. (default: %(default)d)")
args = parser.parse_known_args()
# reconstruct sys.argv to pass to setup below
sys.argv = [sys.argv[0]] + args[1]
# monkey-patch for parallel compilation
# See: https://stackoverflow.com/a/13176803
def parallelCCompile(self,
sources,
output_dir=None,
macros=None,
include_dirs=None,
debug=0,
extra_preargs=None,
extra_postargs=None,
depends=None):
# these lines are copied from distutils.ccompiler.CCompiler directly
macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
output_dir, macros, include_dirs, sources, depends, extra_postargs)
cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
# parallel code
def _single_compile(obj):
try:
src, ext = build[obj]
except KeyError:
return
self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
# convert to list, imap is evaluated on-demand
thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes)
list(thread_pool.imap(_single_compile, objects))
return objects
def compile_test(header, library):
dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
command = "bash -c \"g++ -include " + header \
+ " -l" + library + " -x c++ - <<<'int main() {}' -o " \
+ dummy_path + " >/dev/null 2>/dev/null && rm " \
+ dummy_path + " 2>/dev/null\""
return os.system(command) == 0
# hack compile to support parallel compiling
distutils.ccompiler.CCompiler.compile = parallelCCompile
FILES = glob.glob('kenlm/util/*.cc') \
+ glob.glob('kenlm/lm/*.cc') \
+ glob.glob('kenlm/util/double-conversion/*.cc')
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# FILES + glob.glob('glog/src/*.cc')
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
]
LIBS = ['stdc++']
if platform.system() != 'Darwin':
LIBS.append('rt')
ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11']
if compile_test('zlib.h', 'z'):
ARGS.append('-DHAVE_ZLIB')
LIBS.append('z')
if compile_test('bzlib.h', 'bz2'):
ARGS.append('-DHAVE_BZLIB')
LIBS.append('bz2')
if compile_test('lzma.h', 'lzma'):
ARGS.append('-DHAVE_XZLIB')
LIBS.append('lzma')
os.system('swig -python -c++ ./decoders.i')
decoders_module = [
Extension(
name='_swig_decoders',
sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'),
language='c++',
include_dirs=[
'.',
'kenlm',
'openfst-1.6.3/src/include',
'ThreadPool',
#'glog/src'
],
libraries=LIBS,
extra_compile_args=ARGS)
]
setup(
name='swig_decoders',
version='0.1',
description="""CTC decoders""",
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
#!/usr/bin/env bash
if [ ! -d kenlm ]; then
git clone https://github.com/luotao1/kenlm.git
echo -e "\n"
fi
if [ ! -d openfst-1.6.3 ]; then
echo "Download and extract openfst ..."
wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz
tar -xzvf openfst-1.6.3.tar.gz
echo -e "\n"
fi
if [ ! -d ThreadPool ]; then
git clone https://github.com/progschj/ThreadPool.git
echo -e "\n"
fi
echo "Install decoders ..."
python setup.py install --num_processes 4
"""Wrapper for various CTC decoders in SWIG."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import swig_decoders
class Scorer(swig_decoders.Scorer):
"""Wrapper for Scorer.
:param alpha: Parameter associated with language model. Don't use
language model when alpha = 0.
:type alpha: float
:param beta: Parameter associated with word count. Don't use word
count when beta = 0.
:type beta: float
:param model_path: Path to load language model.
:type model_path: basestring
"""
def __init__(self, alpha, beta, model_path, vocabulary):
swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
def ctc_greedy_decoder(probs_seq, vocabulary):
"""Wrapper for ctc best path decoder in swig.
:param probs_seq: 2-D list of probability distributions over each time
step, with each element being a list of normalized
probabilities over vocabulary and blank.
:type probs_seq: 2-D list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:return: Decoding result string.
:rtype: basestring
"""
return swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary)
def ctc_beam_search_decoder(probs_seq,
vocabulary,
beam_size,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
"""Wrapper for the CTC Beam Search Decoder.
:param probs_seq: 2-D list of probability distributions over each time
step, with each element being a list of normalized
probabilities over vocabulary and blank.
:type probs_seq: 2-D list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param beam_size: Width for beam search.
:type beam_size: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param ext_scoring_func: External scoring function for
partially decoded sentence, e.g. word count
or language model.
:type ext_scoring_func: callable
:return: List of tuples of log probability and sentence as decoding
results, in descending order of the probability.
:rtype: list
"""
return swig_decoders.ctc_beam_search_decoder(probs_seq.tolist(), vocabulary,
beam_size, cutoff_prob,
cutoff_top_n, ext_scoring_func)
def ctc_beam_search_decoder_batch(probs_split,
vocabulary,
beam_size,
num_processes,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
"""Wrapper for the batched CTC beam search decoder.
:param probs_split: 3-D list with each element as an instance of 2-D list
of probabilities used by ctc_beam_search_decoder().
:type probs_split: 3-D list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param beam_size: Width for beam search.
:type beam_size: int
:param num_processes: Number of parallel processes.
:type num_processes: int
:param cutoff_prob: Cutoff probability in vocabulary pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param ext_scoring_func: External scoring function for
partially decoded sentence, e.g. word count
or language model.
:type ext_scoring_func: callable
:return: List of tuples of log probability and sentence as decoding
results, in descending order of the probability.
:rtype: list
"""
probs_split = [probs_seq.tolist() for probs_seq in probs_split]
return swig_decoders.ctc_beam_search_decoder_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func)
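An end-to-end sketch of the wrapper API above (toy probabilities invented for illustration; assumes the package built by setup.py is importable and the LM file has been downloaded by the run scripts below):

import numpy as np
from swig_wrapper import Scorer, ctc_beam_search_decoder_batch

vocab = ['a', 'b', 'c', ' ']
# 2 utterances, 20 time steps each, random rows over vocab + blank
probs = [np.random.dirichlet(np.ones(len(vocab) + 1), size=20)
         for _ in range(2)]

scorer = Scorer(alpha=2.15, beta=0.35,
                model_path='models/lm/common_crawl_00.prune01111.trie.klm',
                vocabulary=vocab)
results = ctc_beam_search_decoder_batch(
    probs_split=probs, vocabulary=vocab, beam_size=500,
    num_processes=8, cutoff_prob=1.0, cutoff_top_n=40,
    ext_scoring_func=scorer)
for beams in results:
    print(beams[0])  # (score, transcript) of the best beam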
@@ -4,7 +4,7 @@ from __future__ import division
from __future__ import print_function
import unittest
-from models import decoder
+from decoders import decoders_deprecated as decoder
class TestDecoders(unittest.TestCase):
@@ -66,16 +66,14 @@ class TestDecoders(unittest.TestCase):
beam_result = decoder.ctc_beam_search_decoder(
probs_seq=self.probs_seq1,
beam_size=self.beam_size,
-vocabulary=self.vocab_list,
-blank_id=len(self.vocab_list))
+vocabulary=self.vocab_list)
self.assertEqual(beam_result[0][1], self.beam_search_result[0])
def test_beam_search_decoder_2(self):
beam_result = decoder.ctc_beam_search_decoder(
probs_seq=self.probs_seq2,
beam_size=self.beam_size,
-vocabulary=self.vocab_list,
-blank_id=len(self.vocab_list))
+vocabulary=self.vocab_list)
self.assertEqual(beam_result[0][1], self.beam_search_result[1])
def test_beam_search_decoder_batch(self):
@@ -83,7 +81,6 @@ class TestDecoders(unittest.TestCase):
probs_split=[self.probs_seq1, self.probs_seq2],
beam_size=self.beam_size,
vocabulary=self.vocab_list,
-blank_id=len(self.vocab_list),
num_processes=24)
self.assertEqual(beam_results[0][0][1], self.beam_search_result[0])
self.assertEqual(beam_results[1][0][1], self.beam_search_result[1])
......
@@ -11,7 +11,7 @@ import wave
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
from data_utils.utils import read_manifest
from utils.utility import add_arguments, print_arguments
@@ -46,7 +46,7 @@ add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('model_path', str,
-'./checkpoints/params.latest.tar.gz',
+'./checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path', str,
@@ -100,7 +100,7 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler):
finish_time = time.time()
print("Response Time: %f, Transcript: %s" %
(finish_time - start_time, transcript))
-self.request.sendall(transcript)
+self.request.sendall(transcript.encode('utf-8'))
def _write_to_file(self, data):
# prepare save dir and filename
......
-#! /usr/bin/bash
+#! /usr/bin/env bash
-pushd ../..
+pushd ../.. > /dev/null
# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
---full_download='True' \
---target_dir=$HOME'/.cache/paddle/dataset/speech/Libri'
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
+--full_download='True'
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
-#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/librispeech/vocab.txt' \
--manifest_paths='data/librispeech/manifest.train'
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
# compute mean and stddev for normalizer
@@ -30,3 +42,4 @@ fi
echo "LibriSpeech Data preparation done."
exit 0
-#! /usr/bin/bash
+#! /usr/bin/env bash
-pushd ../..
+pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
---alpha=0.36 \
---beta=0.25 \
---cutoff_prob=0.99 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
---infer_manifest='data/librispeech/manifest.dev-clean' \
+--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
-#! /usr/bin/bash
+#! /usr/bin/env bash
-pushd ../..
+pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python -u evaluate.py \
+python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
---alpha=0.36 \
---beta=0.25 \
---cutoff_prob=0.99 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
-#! /usr/bin/bash
+#! /usr/bin/env bash
-pushd ../..
+pushd ../.. > /dev/null
# train model
# if you wish to resume from an existing model, uncomment --init_model_path
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u train.py \
---batch_size=256 \
+--batch_size=512 \
--trainer_count=8 \
---num_passes=200 \
+--num_passes=50 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
@@ -15,6 +17,7 @@ python -u train.py \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
+--test_off=False \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
@@ -23,8 +26,16 @@ python -u train.py \
--train_manifest='data/librispeech/manifest.train' \
--dev_manifest='data/librispeech/manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---output_model_dir='./checkpoints' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir='./checkpoints/libri' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+echo "Failed in training!"
+exit 1
+fi
+exit 0
-#! /usr/bin/bash
+#! /usr/bin/env bash
-pushd ../..
+pushd ../.. > /dev/null
# grid-search for hyper-parameters in language model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
@@ -23,8 +24,16 @@ python -u tools/tune.py \
--share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
+if [ $? -ne 0 ]; then
+echo "Failed in tuning!"
+exit 1
+fi
+exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# start demo client
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_client.py \
--host_ip='localhost' \
--host_port=8086
if [ $? -ne 0 ]; then
echo "Failed in starting demo client!"
exit 1
fi
exit 0
#! /usr/bin/env bash
# TODO: replace the model with a Mandarin model
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# start demo server
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_server.py \
--host_ip='localhost' \
--host_port=8086 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--speech_save_dir='demo_cache' \
--warmup_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in starting demo server!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# prepare folder
if [ ! -e data/tiny ]; then
mkdir data/tiny
fi
# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir='~/.cache/paddle/dataset/speech/libri' \
--full_download='False'
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
head -n 64 data/tiny/manifest.dev-clean > data/tiny/manifest.tiny
# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.dev-clean'
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.tiny' \
--num_samples=64 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
echo "Tiny data preparation done."
exit 0
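tools/compute_mean_std.py estimates the feature normalizer from a sample of utterances and stores it as a .npz. A condensed sketch of that flow; `featurize` is a hypothetical stand-in for the repo's audio featurizer, assumed to return a (feature_dim, num_frames) array:

import json
import numpy as np

def compute_mean_std(manifest_path, num_samples, output_path, featurize):
    features = []
    with open(manifest_path) as f:
        for line in f.readlines()[:num_samples]:
            features.append(featurize(json.loads(line)["audio_filepath"]))
    stacked = np.hstack(features)  # concatenate frames across utterances
    np.savez(output_path,
             mean=np.mean(stacked, axis=1),
             std=np.std(stacked, axis=1))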
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=16 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# train model
# if you wish to resume from an existing model, uncomment --init_model_path
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -u train.py \
--batch_size=16 \
--trainer_count=4 \
--num_passes=20 \
--num_proc_data=1 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=1e-5 \
--max_duration=27.0 \
--min_duration=0.0 \
--test_off=False \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/tiny/manifest.tiny' \
--dev_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--output_model_dir='./checkpoints/tiny' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
if [ $? -ne 0 ]; then
echo "Fail to do inference!"
exit 1
fi
exit 0
#! /usr/bin/env bash
pushd ../.. > /dev/null
# grid-search for hyper-parameters in language model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/params.pass-9.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0
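The tuner sweeps a num_alphas x num_betas grid over the language-model weight (alpha) and the word-insertion weight (beta), decoding the tune set at every point and keeping the pair with the lowest error rate. The grid implied by the flags above (the decode-and-score loop is omitted):

import itertools
import numpy as np

alphas = np.linspace(0.1, 0.36, 14)  # --alpha_from, --alpha_to, --num_alphas
betas = np.linspace(0.05, 1.0, 20)   # --beta_from, --beta_to, --num_betas
grid = list(itertools.product(alphas, betas))
print(len(grid))  # 280 (alpha, beta) candidates to decode and score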
......@@ -7,7 +7,7 @@ import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
......@@ -21,9 +21,10 @@ add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('alpha', float, 2.15, "Coef of LM for beam search.")
add_arg('beta', float, 0.35, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
......@@ -35,13 +36,13 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
'data/librispeech/vocab.txt',
"Filepath of vocabulary.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
'./checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('decoding_method', str,
......@@ -84,6 +85,10 @@ def infer():
use_gru=args.use_gru,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
# decoders only accept string encoded in utf-8
vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decoding_method=args.decoding_method,
......@@ -91,7 +96,8 @@ def infer():
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
......@@ -106,6 +112,7 @@ def infer():
print("Current error rate [%s] = %f" %
(args.error_rate_type, error_rate_func(target, result)))
ds2_model.logger.info("finish inference")
def main():
print_arguments(args)
......
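Two decoding details surface in this hunk: vocabulary entries are utf-8 encoded before being handed to the swig decoders, and beam search gains a cutoff_top_n bound on top of cutoff_prob. The real pruning happens inside the compiled decoders; the sketch below is only an assumption about how such per-timestep pruning typically works: keep the most probable characters until their cumulative probability reaches cutoff_prob, but never more than cutoff_top_n of them.

def prune_step(prob_step, cutoff_prob=1.0, cutoff_top_n=40):
    indexed = sorted(enumerate(prob_step), key=lambda x: x[1], reverse=True)
    kept, cum_prob = [], 0.0
    for idx, prob in indexed[:cutoff_top_n]:
        kept.append((idx, prob))
        cum_prob += prob
        if cum_prob >= cutoff_prob:
            break
    return kept

print(prune_step([0.5, 0.3, 0.15, 0.05], cutoff_prob=0.9, cutoff_top_n=3))
# [(0, 0.5), (1, 0.3), (2, 0.15)]  -- cumulative prob 0.95 >= 0.9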
echo "Downloading language model ..."
mkdir data
LM=common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
echo "Checking md5sum ..."
md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
if [ $MD5 != $md5_tmp ]; then
echo "Fail to download the language model!"
exit 1
fi
......@@ -6,12 +6,17 @@ from __future__ import print_function
import sys
import os
import time
import logging
import gzip
from distutils.dir_util import mkpath
import paddle.v2 as paddle
from lm.lm_scorer import LmScorer
from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
from models.decoder import ctc_beam_search_decoder_batch
from models.network import deep_speech_v2_network
from decoders.swig_wrapper import Scorer
from decoders.swig_wrapper import ctc_greedy_decoder
from decoders.swig_wrapper import ctc_beam_search_decoder_batch
from model_utils.network import deep_speech_v2_network
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
class DeepSpeech2Model(object):
......@@ -43,6 +48,8 @@ class DeepSpeech2Model(object):
self._inferer = None
self._loss_inferer = None
self._ext_scorer = None
self.logger = logging.getLogger("")
self.logger.setLevel(level=logging.INFO)
def train(self,
train_batch_reader,
......@@ -53,7 +60,8 @@ class DeepSpeech2Model(object):
num_passes,
output_model_dir,
is_local=True,
num_iterations_print=100):
num_iterations_print=100,
test_off=False):
"""Train the model.
:param train_batch_reader: Train data reader.
......@@ -76,10 +84,12 @@ class DeepSpeech2Model(object):
:type is_local: bool
:param output_model_dir: Directory for saving the model (every pass).
:type output_model_dir: basestring
:param test_off: Turn off testing.
:type test_off: bool
"""
# prepare model output directory
if not os.path.exists(output_model_dir):
os.mkdir(output_model_dir)
mkpath(output_model_dir)
# prepare optimizer and trainer
optimizer = paddle.optimizer.Adam(
......@@ -113,14 +123,19 @@ class DeepSpeech2Model(object):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=dev_batch_reader, feeding=feeding_dict)
if test_off:
print("\n------- Time: %d sec, Pass: %d" %
(time.time() - start_time, event.pass_id))
else:
result = trainer.test(
reader=dev_batch_reader, feeding=feeding_dict)
print("\n------- Time: %d sec, Pass: %d, "
"ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
output_model_path = os.path.join(
output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
with gzip.open(output_model_path, 'w') as f:
self._parameters.to_tar(f)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
# run train
trainer.train(
......@@ -148,8 +163,8 @@ class DeepSpeech2Model(object):
return self._loss_inferer.infer(input=infer_data)
def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, vocab_list, language_model_path,
num_processes):
beam_size, cutoff_prob, cutoff_top_n, vocab_list,
language_model_path, num_processes):
"""Model inference. Infer the transcription for a batch of speech
utterances.
......@@ -169,6 +184,10 @@ class DeepSpeech2Model(object):
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:param language_model_path: Filepath for language model.
......@@ -200,21 +219,33 @@ class DeepSpeech2Model(object):
elif decoding_method == "ctc_beam_search":
# initialize external scorer
if self._ext_scorer == None:
self._ext_scorer = LmScorer(beam_alpha, beam_beta,
language_model_path)
self._loaded_lm_path = language_model_path
self.logger.info("begin to initialize the external scorer "
"for decoding")
self._ext_scorer = Scorer(beam_alpha, beam_beta,
language_model_path, vocab_list)
lm_char_based = self._ext_scorer.is_character_based()
lm_max_order = self._ext_scorer.get_max_order()
lm_dict_size = self._ext_scorer.get_dict_size()
self.logger.info("language model: "
"is_character_based = %d," % lm_char_based +
" max_order = %d," % lm_max_order +
" dict_size = %d" % lm_dict_size)
self.logger.info("end initializing scorer. Start decoding ...")
else:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
assert self._loaded_lm_path == language_model_path
# beam search decode
num_processes = min(num_processes, len(probs_split))
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
blank_id=len(vocab_list),
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob)
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n)
results = [result[0][1] for result in beam_search_results]
else:
......
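Condensed usage of the new decoding path, pieced together from the calls in this diff (Scorer takes the same positional arguments as in the hunk above; probs_split and vocab_list are assumed to be prepared as in infer.py, so this sketch only runs inside the repo):

from decoders.swig_wrapper import Scorer, ctc_beam_search_decoder_batch

ext_scorer = Scorer(2.15, 0.35,  # alpha, beta defaults from infer.py
                    'models/lm/common_crawl_00.prune01111.trie.klm',
                    vocab_list)  # utf-8 encoded tokens
beam_results = ctc_beam_search_decoder_batch(
    probs_split=probs_split,  # one CTC probability matrix per utterance
    vocabulary=vocab_list,
    beam_size=500,
    num_processes=8,
    ext_scoring_func=ext_scorer,
    cutoff_prob=1.0,
    cutoff_top_n=40)
transcripts = [beams[0][1] for beams in beam_results]  # best hypothesis each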
#! /usr/bin/env bash
source ../../utils/utility.sh
URL='http://cloud.dlnel.org/filepub/?uuid=6c83b9d8-3255-4adf-9726-0fe0be3d0274'
MD5=28521a58552885a81cf92a1e9b133a71
TARGET=./aishell_model.tar.gz
echo "Download Aishell model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download Aishell model!"
exit 1
fi
tar -zxvf $TARGET
exit 0
#! /usr/bin/env bash
source ../../utils/utility.sh
URL='http://cloud.dlnel.org/filepub/?uuid=17404caf-cf19-492f-9707-1fad07c19aae'
MD5=ea5024a457a91179472f6dfee60e053d
TARGET=./librispeech_model.tar.gz
echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET
exit 0
#! /usr/bin/env bash
source ../../utils/utility.sh
URL=http://cloud.dlnel.org/filepub/?uuid=d21861e4-4ed6-45bb-ad8e-ae417a43195e
MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=./zh_giga.no_cna_cmn.prune01244.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0
#! /usr/bin/env bash
source ../../utils/utility.sh
URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=./common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0
#!/bin/bash
#! /usr/bin/env bash
# install python dependencies
if [ -f "requirements.txt" ]; then
......@@ -20,10 +20,19 @@ if [ $? != 0 ]; then
fi
tar -zxvf libsndfile-1.0.28.tar.gz
cd libsndfile-1.0.28
./configure && make && make install
./configure > /dev/null && make > /dev/null && make install > /dev/null
cd ..
rm -rf libsndfile-1.0.28
rm libsndfile-1.0.28.tar.gz
fi
# install decoders
python -c "import swig_decoders"
if [ $? != 0 ]; then
cd decoders/swig > /dev/null
sh setup.sh
cd - > /dev/null
fi
echo "Install all dependencies successfully."
......@@ -7,7 +7,7 @@ import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
......@@ -22,9 +22,10 @@ add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('alpha', float, 2.15, "Coef of LM for beam search.")
add_arg('beta', float, 0.35, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
......@@ -36,14 +37,14 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
'data/librispeech/vocab.txt',
"Filepath of vocabulary.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
'./checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('decoding_method', str,
'ctc_beam_search',
......@@ -85,6 +86,9 @@ def evaluate():
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
# decoders only accept string encoded in utf-8
vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
error_rate_func = cer if args.error_rate_type == 'cer' else wer
error_sum, num_ins = 0.0, 0
for infer_data in batch_reader():
......@@ -95,7 +99,8 @@ def evaluate():
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
target_transcripts = [
......@@ -110,6 +115,7 @@ def evaluate():
print("Final error rate [%s] (%d/%d) = %f" %
(args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
ds2_model.logger.info("finish evaluation")
def main():
print_arguments(args)
......
......@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
add_arg('vocab_path', str,
'datasets/vocab/zh_vocab.txt',
'data/librispeech/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
......@@ -34,7 +34,7 @@ args = parser.parse_args()
def count_manifest(counter, manifest_path):
manifest_jsons = utils.read_manifest(manifest_path)
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
for char in line_json['text']:
counter.update(char)
......
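build_vocab.py reduces the manifests to a character vocabulary: count every character in the transcripts, drop the rare ones, and write one token per line, as in the vocab.txt files referenced throughout these scripts. A simplified sketch (ordering and threshold semantics are assumptions, not the script's exact behavior):

import codecs
import json
from collections import Counter

def build_vocab(manifest_path, vocab_path, count_threshold=0):
    counter = Counter()
    with codecs.open(manifest_path, 'r', 'utf-8') as f:
        for line in f:
            counter.update(json.loads(line)['text'])  # per-character counts
    with codecs.open(vocab_path, 'w', 'utf-8') as f:
        for char, count in sorted(counter.items(), key=lambda x: -x[1]):
            if count > count_threshold:
                f.write(char + '\n')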
......@@ -20,10 +20,10 @@ add_arg('specgram_type', str,
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
add_arg('manifest_path', str,
'datasets/manifest.train',
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
# yapf: disable
args = parser.parse_args()
......
#! /usr/bin/env bash
BATCH_SIZE_PER_GPU=64
MIN_DURATION=6.0
MAX_DURATION=7.0
function join_by { local IFS="$1"; shift; echo "$*"; }
for NUM_GPUS in 16 8 4 2 1
do
DEVICES=$(join_by , $(seq 0 $(($NUM_GPUS-1))))
BATCH_SIZE=$(($BATCH_SIZE_PER_GPU * $NUM_GPUS))
CUDA_VISIBLE_DEVICES=$DEVICES \
python train.py \
--batch_size=$BATCH_SIZE \
--num_passes=1 \
--test_off=True \
--trainer_count=$NUM_GPUS \
--min_duration=$MIN_DURATION \
--max_duration=$MAX_DURATION > tmp.log 2>&1
if [ $? -ne 0 ];then
exit 1
fi
cat tmp.log | grep "Time" | awk '{print "GPU Num: " "'"$NUM_GPUS"'" " Time: "$3}'
rm tmp.log
done
......@@ -9,7 +9,7 @@ import functools
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer
from utils.utility import add_arguments, print_arguments
......@@ -41,13 +41,13 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
'data/librispeech/vocab.txt',
"Filepath of vocabulary.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
'./checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('error_rate_type', str,
......
......@@ -6,7 +6,7 @@ from __future__ import print_function
import argparse
import functools
import paddle.v2 as paddle
from models.model import DeepSpeech2Model
from model_utils.model import DeepSpeech2Model
from data_utils.data import DataGenerator
from utils.utility import add_arguments, print_arguments
......@@ -25,6 +25,7 @@ add_arg('num_iter_print', int, 100, "Every # iterations for printing "
add_arg('learning_rate', float, 5e-4, "Learning rate.")
add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
add_arg('test_off', bool, False, "Turn off testing.")
add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
......@@ -41,14 +42,14 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
'data/librispeech/vocab.txt',
"Filepath of vocabulary.")
add_arg('init_model_path', str,
None,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('output_model_dir', str,
"./checkpoints",
"./checkpoints/libri",
"Directory for saving checkpoints.")
add_arg('augment_conf_path',str,
'conf/augmentation.config',
......@@ -111,7 +112,8 @@ def train():
num_passes=args.num_passes,
num_iterations_print=args.num_iter_print,
output_model_dir=args.output_model_dir,
is_local=args.is_local)
is_local=args.is_local,
test_off=args.test_off)
def main():
......
download() {
URL=$1
MD5=$2
TARGET=$3
if [ -e $TARGET ]; then
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
if [ $MD5 == $md5_result ]; then
echo "$TARGET already exists, download skipped."
return 0
fi
fi
wget -c $URL -O "$TARGET"
if [ $? -ne 0 ]; then
return 1
fi
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
if [ ! $MD5 == $md5_result ]; then
return 1
fi
}
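download() is the shared helper sourced by the model and LM scripts above: skip the fetch when the target already exists with a matching md5, otherwise wget and verify. For reference, a hedged Python equivalent of the checksum step (hashlib-based, not part of the repo):

import hashlib

def md5_matches(path, expected_md5, chunk_size=1 << 20):
    # stream in chunks so large model tarballs never load fully into memory
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5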