rm tune.py in root dir of DS2

f7032c82 · Yibing Liu · d43b33c1 · 7b938589 · f7032c82 · f7032c82
88 changed file
--- a/README.md
+++ b/README.md
--- a/cloud/README.md
+++ b/cloud/README.md
+# Train DeepSpeech2 on PaddleCloud
+>Note:
+>Please make sure [PaddleCloud Client](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md#%E4%B8%8B%E8%BD%BD%E5%B9%B6%E9%85%8D%E7%BD%AEpaddlecloud) has been installed and the current directory is `deep_speech_2/cloud/`.
+## Step 1:  Upload Data
+Provided with several input manifests, `pcloud_upload_data.sh` will pack and upload all the containing audio files to PaddleCloud filesystem, and also generate some corresponding manifest files with updated cloud paths.
+Please modify the following arguments in `pcloud_upload_data.sh`:
+- `IN_MANIFESTS`: Paths (in local filesystem) of manifest files containing the audio files to be uploaded. Multiple paths can be concatenated with a whitespace delimiter.
+- `OUT_MANIFESTS`: Paths (in local filesystem) to write the updated output manifest files to. Multiple paths can be concatenated with a whitespace delimiter. The values of `audio_filepath` in the output manifests are updated with cloud filesystem paths.
+- `CLOUD_DATA_DIR`:  Directory (in PaddleCloud filesystem) to upload the data to. Don't forget to replace `USERNAME` in the default directory and make sure that you have the permission to write it.
+- `NUM_SHARDS`: Number of data shards / parts (in tar files) to be generated when packing and uploading data. A smaller `NUM_SHARDS` requires more temporary local disk space for packing data.
+By running:
+```
+sh pcloud_upload_data.sh
+```
+all the audio files will be uploaded to PaddleCloud filesystem, and you will get modified manifests files in `OUT_MANIFESTS`.
+You have to take this step only once, the very first time you do cloud training. Later on, the data is persistent on the cloud filesystem and reusable for further job submissions.
+## Step 2:  Configure Training
+Configure cloud training arguments in `pcloud_submit.sh`, with the following arguments:
+- `TRAIN_MANIFEST`: Manifest filepath (in local filesystem) for training. Notice that the `audio_filepath` should be in cloud filesystem, like those generated by `pcloud_upload_data.sh`.
+- `DEV_MANIFEST`: Manifest filepath (in local filesystem) for validation.
+- `CLOUD_MODEL_DIR`: Directory (in PaddleCloud filesystem) to save the model parameters (checkpoints). Don't forget to replace `USERNAME` in the default directory and make sure that you have the permission to write it.
+- `BATCH_SIZE`: Training batch size for a single node.
+- `NUM_GPU`: Number of GPUs allocated for a single node.
+- `NUM_NODE`: Number of nodes (machines) allocated for this job.
+- `IS_LOCAL`: Set to False to enable parameter server, if using multiple nodes.
+Configure other training hyper-parameters in `pcloud_train.sh` as you wish, just as what you can do in local training.
+By running:
+```
+sh pcloud_submit.sh
+```
+you submit a training job to PaddleCloud. And you will see the job name when the submission is done.
+## Step 3:  Get Job Logs
+Run this to list all the jobs you have submitted, as well as their running status:
+```
+paddlecloud get jobs
+```
+Run this to print the logs of the corresponding job:
+```
+paddlecloud logs -n 10000 $REPLACED_WITH_YOUR_ACTUAL_JOB_NAME
+```
+## More Help
+For more information about the usage of PaddleCloud, please refer to [PaddleCloud Usage](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md#提交任务).
--- a/cloud/_init_paths.py
+++ b/cloud/_init_paths.py
+"""Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os.path
+import sys
+def add_path(path):
+    """Prepend `path` to `sys.path` unless it is already listed there."""
+    if path in sys.path:
+        return
+    sys.path.insert(0, path)
+# Put the parent of this file's directory on sys.path at import time, so
+# that modules located one level above this directory can be imported.
+this_dir = os.path.dirname(__file__)
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
--- a/cloud/pcloud_submit.sh
+++ b/cloud/pcloud_submit.sh
+#! /usr/bin/env bash
+# Submit a training job with `paddlecloud submit`.
+# The variables below are forwarded, in this order, to pcloud_train.sh:
+# TRAIN_MANIFEST DEV_MANIFEST CLOUD_MODEL_DIR NUM_GPU BATCH_SIZE IS_LOCAL.
+TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
+DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
+CLOUD_MODEL_DIR="./checkpoints"
+BATCH_SIZE=512
+NUM_GPU=8
+NUM_NODE=1
+IS_LOCAL="True"
+# Job name carries a timestamp (YYYYmmddHHMMSS) of the submission time.
+JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
+# Parent directory of the current working directory.
+DS2_PATH=${PWD%/*}
+# Temporarily copy the entry script into DS2_PATH; it is removed again
+# after submission. Presumably DS2_PATH is what gets packaged for the job
+# -- TODO confirm against the paddlecloud client documentation.
+cp -f  pcloud_train.sh ${DS2_PATH}
+# NOTE: -cpu is set from NUM_GPU as well, and -parallelism from NUM_NODE.
+paddlecloud submit \
+-image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \
+-jobname ${JOB_NAME} \
+-cpu ${NUM_GPU} \
+-gpu ${NUM_GPU} \
+-memory 64Gi \
+-parallelism ${NUM_NODE} \
+-pscpu 1 \
+-pservers 1 \
+-psmemory 64Gi \
+-passes 1 \
+-entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
+${DS2_PATH}
+rm ${DS2_PATH}/pcloud_train.sh
--- a/cloud/pcloud_train.sh
+++ b/cloud/pcloud_train.sh
+#! /usr/bin/env bash
+# Training entry script, meant to be run inside a PaddleCloud job.
+# Usage: pcloud_train.sh TRAIN_MANIFEST DEV_MANIFEST MODEL_PATH NUM_GPU \
+#                        BATCH_SIZE IS_LOCAL
+TRAIN_MANIFEST=$1
+DEV_MANIFEST=$2
+MODEL_PATH=$3
+NUM_GPU=$4
+BATCH_SIZE=$5
+IS_LOCAL=$6
+# Write this node's portion of each manifest to a local manifest file.
+python ./cloud/split_data.py \
+--in_manifest_path=${TRAIN_MANIFEST} \
+--out_manifest_path='/local.manifest.train'
+python ./cloud/split_data.py \
+--in_manifest_path=${DEV_MANIFEST} \
+--out_manifest_path='/local.manifest.dev'
+# -p: do not fail when ./logs already exists.
+mkdir -p ./logs
+# --output_model_dir is passed exactly once, taken from MODEL_PATH; the
+# former additional hard-coded './checkpoints' value was overridden anyway.
+python -u train.py \
+--batch_size=${BATCH_SIZE} \
+--trainer_count=${NUM_GPU} \
+--num_passes=200 \
+--num_proc_data=${NUM_GPU} \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=5e-4 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=${IS_LOCAL} \
+--share_rnn_weights=True \
+--train_manifest='/local.manifest.train' \
+--dev_manifest='/local.manifest.dev' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir=${MODEL_PATH} \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped' \
+2>&1 | tee ./logs/train.log
--- a/cloud/pcloud_upload_data.sh
+++ b/cloud/pcloud_upload_data.sh
+#! /usr/bin/env bash
+# Pack the audio files listed in IN_MANIFESTS, upload them with
+# upload_data.py and write the updated manifests to OUT_MANIFESTS.
+# -p: do not fail when the output directory already exists (e.g. rerun).
+mkdir -p cloud_manifests
+IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean"
+OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test"
+CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
+NUM_SHARDS=50
+# IN_MANIFESTS / OUT_MANIFESTS are left unquoted on purpose: each
+# whitespace-separated path must become its own argument.
+python upload_data.py \
+--in_manifest_paths ${IN_MANIFESTS} \
+--out_manifest_paths ${OUT_MANIFESTS} \
+--cloud_data_dir ${CLOUD_DATA_DIR} \
+--num_shards ${NUM_SHARDS}
+if [ $? -ne 0 ]
+then
+    echo "Upload Data Failed!"
+    exit 1
+fi
+echo "All Done."
--- a/cloud/split_data.py
+++ b/cloud/split_data.py
+"""This tool is used for splitting data into each node of
+paddlecloud. This script should be called in paddlecloud.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import json
+import argparse
+# Command-line interface. Note that the arguments are parsed at module
+# level, i.e. as soon as this file is executed or imported.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--in_manifest_path",
+    type=str,
+    required=True,
+    help="Input manifest path for all nodes.")
+parser.add_argument(
+    "--out_manifest_path",
+    type=str,
+    required=True,
+    help="Output manifest file path for current node.")
+args = parser.parse_args()
+def split_data(in_manifest_path, out_manifest_path):
+    """Write the share of a manifest that belongs to the current node.
+
+    The node's index and the total number of nodes are read from the first
+    line of the files `/trainer_id` and `/trainer_count`. Line `i` of the
+    input manifest is kept when `i % trainer_count == trainer_id`.
+
+    :param in_manifest_path: Input manifest path for all nodes.
+    :type in_manifest_path: basestring
+    :param out_manifest_path: Output manifest path for the current node.
+    :type out_manifest_path: basestring
+    :raises ValueError: If an id file does not hold an integer.
+    """
+    # strip() instead of [:-1]: the latter drops the last digit when the
+    # file does not end with a newline.
+    with open("/trainer_id", "r") as f:
+        trainer_id = int(f.readline().strip())
+    with open("/trainer_count", "r") as f:
+        trainer_count = int(f.readline().strip())
+    # read through a context manager so the input handle is closed
+    with open(in_manifest_path, 'r') as in_file:
+        out_manifest = [
+            "%s\n" % json_line.strip()
+            for index, json_line in enumerate(in_file)
+            if (index % trainer_count) == trainer_id
+        ]
+    with open(out_manifest_path, 'w') as f:
+        f.writelines(out_manifest)
+if __name__ == '__main__':
+    split_data(args.in_manifest_path, args.out_manifest_path)
--- a/cloud/upload_data.py
+++ b/cloud/upload_data.py
+"""This script is for uploading data for DeepSpeech2 training on paddlecloud.
+Steps:
+1. Read original manifests and extract local sound files.
+2. Tar all local sound files into multiple tar files and upload them.
+3. Modify original manifests with updated paths in cloud filesystem.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import json
+import os
+import tarfile
+import sys
+import argparse
+import shutil
+from subprocess import call
+import _init_paths
+from data_utils.utils import read_manifest
+# Command-line interface. Note that the arguments are parsed at module
+# level, i.e. as soon as this file is executed or imported.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--in_manifest_paths",
+    default=[
+        "../datasets/manifest.train", "../datasets/manifest.dev",
+        "../datasets/manifest.test"
+    ],
+    type=str,
+    nargs='+',
+    help="Local filepaths of input manifests to load, pack and upload. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--out_manifest_paths",
+    default=[
+        "./cloud.manifest.train", "./cloud.manifest.dev",
+        "./cloud.manifest.test"
+    ],
+    type=str,
+    nargs='+',
+    help="Local filepaths of modified manifests to write to. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--cloud_data_dir",
+    required=True,
+    type=str,
+    help="Destination directory on paddlecloud to upload data to.")
+parser.add_argument(
+    "--num_shards",
+    default=10,
+    type=int,
+    help="Number of parts to split data to. (default: %(default)s)")
+parser.add_argument(
+    "--local_tmp_dir",
+    default="./tmp/",
+    type=str,
+    help="Local directory for storing temporary data. (default: %(default)s)")
+args = parser.parse_args()
+def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
+                upload_tar_dir, num_shards):
+    """Extract and pack sound files listed in the manifest files into
+    multiple tar files and upload them to paddlecloud. Besides, generate new
+    manifest files with updated paths in paddlecloud.
+
+    :param in_manifest_path_list: Local paths of the input manifests.
+    :type in_manifest_path_list: list
+    :param out_manifest_path_list: Local paths to write the updated
+                                   manifests to, paired by position with
+                                   the input manifests.
+    :type out_manifest_path_list: list
+    :param local_tmp_dir: Local directory in which each tar file is built
+                          before it is uploaded and deleted.
+    :type local_tmp_dir: basestring
+    :param upload_tar_dir: Destination directory in the cloud filesystem.
+    :type upload_tar_dir: basestring
+    :param num_shards: Number of parts used to compute the lines per tar.
+    :type num_shards: int
+    :raises IOError: If a paddlecloud copy fails.
+    """
+    # compute total audio number
+    total_line = 0
+    for manifest_path in in_manifest_path_list:
+        with open(manifest_path, 'r') as f:
+            total_line += len(f.readlines())
+    line_per_tar = (total_line // num_shards) + 1
+    # pack and upload shard by shard
+    line_count, tar_file, tar_path = 0, None, None
+    for manifest_path, out_manifest_path in zip(in_manifest_path_list,
+                                                out_manifest_path_list):
+        manifest = read_manifest(manifest_path)
+        out_manifest = []
+        for json_data in manifest:
+            sound_filepath = json_data['audio_filepath']
+            sound_filename = os.path.basename(sound_filepath)
+            if line_count % line_per_tar == 0:
+                # shard boundary: ship the finished tar before starting
+                # the next one
+                if tar_file is not None:
+                    tar_file.close()
+                    pcloud_cp(tar_path, upload_tar_dir)
+                    os.remove(tar_path)
+                tar_name = 'part-%s-of-%s.tar' % (
+                    str(line_count // line_per_tar).zfill(5),
+                    str(num_shards).zfill(5))
+                tar_path = os.path.join(local_tmp_dir, tar_name)
+                tar_file = tarfile.open(tar_path, 'w')
+            # files are stored under their base name only
+            tar_file.add(sound_filepath, arcname=sound_filename)
+            line_count += 1
+            json_data['audio_filepath'] = "tar:%s#%s" % (
+                os.path.join(upload_tar_dir, tar_name), sound_filename)
+            out_manifest.append("%s\n" % json.dumps(json_data))
+        with open(out_manifest_path, 'w') as f:
+            f.writelines(out_manifest)
+        pcloud_cp(out_manifest_path, upload_tar_dir)
+    # ship the last shard; no tar was ever opened when every manifest
+    # was empty
+    if tar_file is not None:
+        tar_file.close()
+        pcloud_cp(tar_path, upload_tar_dir)
+        os.remove(tar_path)
+def pcloud_mkdir(dir):
+    """Create the directory `dir` in the PaddleCloud filesystem.
+
+    :raises IOError: If the `paddlecloud mkdir` command exits non-zero.
+    """
+    exit_code = call(['paddlecloud', 'mkdir', dir])
+    if exit_code == 0:
+        return
+    raise IOError("PaddleCloud mkdir failed: %s." % dir)
+def pcloud_cp(src, dst):
+    """Run `paddlecloud cp src dst`: copy src from the local filesystem to
+    dst in the PaddleCloud filesystem, or download src from the PaddleCloud
+    filesystem to dst in the local filesystem.
+
+    :raises IOError: If the `paddlecloud cp` command exits non-zero.
+    """
+    exit_code = call(['paddlecloud', 'cp', src, dst])
+    if exit_code == 0:
+        return
+    raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
+if __name__ == '__main__':
+    if not os.path.exists(args.local_tmp_dir):
+        os.makedirs(args.local_tmp_dir)
+    try:
+        pcloud_mkdir(args.cloud_data_dir)
+        upload_data(args.in_manifest_paths, args.out_manifest_paths,
+                    args.local_tmp_dir, args.cloud_data_dir, args.num_shards)
+    finally:
+        # remove the temporary directory even when the upload fails midway,
+        # so that no partially written tar files are left behind
+        shutil.rmtree(args.local_tmp_dir)
--- a/data/aishell/aishell.py
+++ b/data/aishell/aishell.py
+"""Prepare Aishell mandarin dataset
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import codecs
+import soundfile
+import json
+import argparse
+from data_utils.utility import download, unpack
+# Base directory under the user's home used for the default target_dir.
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+# Download location of the dataset archive and its expected MD5 checksum.
+URL_ROOT = 'http://www.openslr.org/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+# Command-line interface. Note that the arguments are parsed at module
+# level, i.e. as soon as this file is executed or imported.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create one manifest file for each of the 'train', 'dev' and 'test'
+    data types.
+
+    Each manifest line is a json object holding the audio filepath, the
+    duration (number of samples divided by the sample rate) and the
+    transcript of one audio file. Audio files without a transcription are
+    skipped.
+
+    :param data_dir: Directory containing the 'transcript' and 'wav'
+                     sub-directories.
+    :type data_dir: basestring
+    :param manifest_path_prefix: Output path prefix; the manifests are
+                                 written to '<prefix>.train',
+                                 '<prefix>.dev' and '<prefix>.test'.
+    :type manifest_path_prefix: basestring
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    with codecs.open(transcript_path, 'r', 'utf-8') as transcript_file:
+        for line in transcript_file:
+            line = line.strip()
+            if line == '': continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+    data_types = ['train', 'dev', 'test']
+    for data_type in data_types:
+        # start from an empty list for every data type; otherwise the
+        # entries of earlier types would leak into the later manifests
+        json_lines = []
+        audio_dir = os.path.join(data_dir, 'wav', data_type)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.join(subfolder, fname)
+                # the audio id is the file name without its extension
+                audio_id = os.path.splitext(fname)[0]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.' + data_type
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Fetch and unpack the dataset if needed, then create the manifests.
+
+    Downloading and unpacking are skipped when the 'data_aishell' directory
+    already exists inside target_dir.
+    """
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if os.path.exists(data_dir):
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    else:
+        archive_path = download(url, md5sum, target_dir)
+        unpack(archive_path, target_dir)
+        # every file below 'wav' is itself a tar: unpack it in place and
+        # remove it afterwards
+        wav_root = os.path.join(data_dir, 'wav')
+        for folder, _, tar_names in sorted(os.walk(wav_root)):
+            for tar_name in tar_names:
+                unpack(os.path.join(folder, tar_name), folder, True)
+    create_manifest(data_dir, manifest_path)
+def main():
+    """Expand the target directory and prepare the dataset there."""
+    # expanduser returns paths without a leading '~' unchanged, so it can
+    # be applied unconditionally
+    args.target_dir = os.path.expanduser(args.target_dir)
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix)
+if __name__ == '__main__':
+    main()
--- a/datasets/librispeech/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -12,13 +12,11 @@ from __future__ import print_function
 import distutils.util
 import os
 import sys
-import tarfile
 import argparse
 import soundfile
 import json
-from paddle.v2.dataset.common import md5file
+import codecs
+from data_utils.utility import download, unpack
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
@@ -40,7 +38,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
-    default=DATA_HOME + "/Libri",
+    default='~/.cache/paddle/dataset/speech/libri',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@@ -58,36 +56,8 @@ parser.add_argument(
 args = parser.parse_args()
-def download(url, md5sum, target_dir):
-    """
-    Download file from url to target_dir, and check md5sum.
-    """
-    if not os.path.exists(target_dir): os.makedirs(target_dir)
-    filepath = os.path.join(target_dir, url.split("/")[-1])
-    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
-        print("Downloading %s ..." % url)
-        os.system("wget -c " + url + " -P " + target_dir)
-        print("\nMD5 Chesksum %s ..." % filepath)
-        if not md5file(filepath) == md5sum:
-            raise RuntimeError("MD5 checksum failed.")
-    else:
-        print("File exists, skip downloading. (%s)" % filepath)
-    return filepath
-def unpack(filepath, target_dir):
-    """
-    Unpack the file to the target_dir.
-    """
-    print("Unpacking %s ..." % filepath)
-    tar = tarfile.open(filepath)
-    tar.extractall(target_dir)
-    tar.close()
 def create_manifest(data_dir, manifest_path):
-    """
+    """Create a manifest json file summarizing the data set, with each line
-    Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
@@ -112,14 +82,13 @@ def create_manifest(data_dir, manifest_path):
                        'duration': duration,
                        'text': text
                    }))
-    with open(manifest_path, 'w') as out_file:
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """
+    """Download, unpack and create summmary manifest file.
-    Download, unpack and create summmary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
@@ -134,6 +103,9 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
@@ -144,12 +116,12 @@ def main():
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
-    prepare_dataset(
-        url=URL_TRAIN_CLEAN_100,
-        md5sum=MD5_TRAIN_CLEAN_100,
-        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,

--- a/datasets/noise/chime3_background.py
+++ b/datasets/noise/chime3_background.py
--- a/data_utils/augmentor/impulse_response.py
+++ b/data_utils/augmentor/impulse_response.py
@@ -4,23 +4,22 @@ from __future__ import division
 from __future__ import print_function
 from data_utils.augmentor.base import AugmentorBase
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentation model for adding impulse response effect.
    :param rng: Random generator object.
    :type rng: random.Random
    :param impulse_manifest_path: Manifest path for impulse audio data.
-    :type impulse_manifest_path: basestring 
+    :type impulse_manifest_path: basestring
    """
    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
-        self._impulse_manifest = utils.read_manifest(
+        self._impulse_manifest = read_manifest(impulse_manifest_path)
-            manifest_path=impulse_manifest_path)
    def transform_audio(self, audio_segment):
        """Add impulse response effect.

--- a/data_utils/augmentor/noise_perturb.py
+++ b/data_utils/augmentor/noise_perturb.py
@@ -4,13 +4,13 @@ from __future__ import division
 from __future__ import print_function
 from data_utils.augmentor.base import AugmentorBase
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.
    :param rng: Random generator object.
    :type rng: random.Random
    :param min_snr_dB: Minimal signal noise ratio, in decibels.
@@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase):
    :param max_snr_dB: Maximal signal noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
-    :type noise_manifest_path: basestring 
+    :type noise_manifest_path: basestring
    """
    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
-        self._noise_manifest = utils.read_manifest(
+        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
-            manifest_path=noise_manifest_path)
    def transform_audio(self, audio_segment):
        """Add background noise audio.

--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -6,10 +6,12 @@ from __future__ import division
 from __future__ import print_function
 import random
-import numpy as np
+import tarfile
 import multiprocessing
+import numpy as np
 import paddle.v2 as paddle
-from data_utils import utils
+from threading import local
+from data_utils.utility import read_manifest
 from data_utils.augmentor.augmentation import AugmentationPipeline
 from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
 from data_utils.speech import SpeechSegment
@@ -46,7 +48,7 @@ class DataGenerator(object):
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
-                                 before extracting the features.
+                                before extracting the features.
    :type use_dB_normalization: bool
    :param num_threads: Number of CPU threads for processing data.
    :type num_threads: int
@@ -82,16 +84,20 @@ class DataGenerator(object):
        self._num_threads = num_threads
        self._rng = random.Random(random_seed)
        self._epoch = 0
+        # for caching tar files info
+        self._local_data = local()
+        self._local_data.tar2info = {}
+        self._local_data.tar2object = {}
    def process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data.
        :param filename: Audio filepath
-        :type filename: basestring
+        :type filename: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and list of token ids for
-                 transcription. 
+                 transcription.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
@@ -111,7 +117,7 @@ class DataGenerator(object):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.
        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.
@@ -153,7 +159,7 @@ class DataGenerator(object):
        def batch_reader():
            # read manifest
-            manifest = utils.read_manifest(
+            manifest = read_manifest(
                manifest_path=manifest_path,
                max_duration=self._max_duration,
                min_duration=self._min_duration)
@@ -191,9 +197,9 @@ class DataGenerator(object):
    @property
    def feeding(self):
        """Returns data reader's feeding dict.
        :return: Data feeding dict.
-        :rtype: dict 
+        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}
@@ -215,6 +221,38 @@ class DataGenerator(object):
        """
        return self._speech_featurizer.vocab_list
+    def _parse_tar(self, file):
+        """Open the tar file at `file` and index its members.
+
+        :return: Tuple of the opened tarfile object (left open for later
+                 reads) and a dict mapping member name to its tarinfo.
+        :rtype: tuple of (tarfile.TarFile, dict)
+        """
+        tar_object = tarfile.open(file)
+        name2info = {
+            member.name: member
+            for member in tar_object.getmembers()
+        }
+        return tar_object, name2info
+    def _get_file_object(self, file):
+        """Get a readable file object for an audio file reference.
+
+        If `file` starts with 'tar:', it is expected to have the form
+        'tar:<tar path>#<member name>'. The member is then read out of the
+        tar file, and the opened tar file object and its member infos are
+        cached per thread for the next reading request.
+        If `file` is not a path but already a file-like object (it has a
+        `read` attribute), it is returned directly.
+        Otherwise `file` is opened as a plain filesystem path.
+
+        :param file: Audio file reference.
+        :type file: basestring | file
+        :return: File object to read the audio data from.
+        :raises KeyError: If the member name is not found in the tar file.
+        """
+        if hasattr(file, 'read'):
+            return file
+        if not file.startswith('tar:'):
+            # audio is binary data, so never open it in text mode
+            return open(file, 'rb')
+        tarpath, filename = file.split(':', 1)[1].split('#', 1)
+        # attributes of a threading.local exist only in the thread that
+        # assigned them, so create the caches lazily in each thread
+        local_data = self._local_data
+        if not hasattr(local_data, 'tar2info'):
+            local_data.tar2info = {}
+        if not hasattr(local_data, 'tar2object'):
+            local_data.tar2object = {}
+        if tarpath not in local_data.tar2info:
+            tar_object, infoes = self._parse_tar(tarpath)
+            local_data.tar2info[tarpath] = infoes
+            local_data.tar2object[tarpath] = tar_object
+        return local_data.tar2object[tarpath].extractfile(
+            local_data.tar2info[tarpath][filename])
    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
@@ -229,8 +267,9 @@ class DataGenerator(object):
                yield instance
        def mapper(instance):
-            return self.process_utterance(instance["audio_filepath"],
+            return self.process_utterance(
-                                          instance["text"])
+                self._get_file_object(instance["audio_filepath"]),
+                instance["text"])
        return paddle.reader.xmap_readers(
            mapper, reader, self._num_threads, 1024, order=True)

--- a/data_utils/featurizer/audio_featurizer.py
+++ b/data_utils/featurizer/audio_featurizer.py
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 from python_speech_features import mfcc
 from python_speech_features import delta
@@ -57,7 +57,7 @@ class AudioFeaturizer(object):
    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
-                  allow_upsamplling=True):
+                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.
        :param audio_segment: Audio/speech segment to extract features from.
@@ -159,24 +159,27 @@ class AudioFeaturizer(object):
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
-            raise ValueError("max_freq must be greater than half of "
+            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
-        # compute 13 cepstral coefficients, and the first one is replaced
+        # compute the 13 cepstral coefficients, and the first one is replaced
        # by log(frame energy)
-        mfcc_feat = np.transpose(
+        mfcc_feat = mfcc(
-            mfcc(
+            signal=samples,
-                signal=samples,
+            samplerate=sample_rate,
-                samplerate=sample_rate,
+            winlen=0.001 * window_ms,
-                winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
-                winstep=0.001 * stride_ms,
+            highfreq=max_freq)
-                highfreq=max_freq))
        # Deltas
        d_mfcc_feat = delta(mfcc_feat, 2)
        # Deltas-Deltas
        dd_mfcc_feat = delta(d_mfcc_feat, 2)
+        # transpose
+        mfcc_feat = np.transpose(mfcc_feat)
+        d_mfcc_feat = np.transpose(d_mfcc_feat)
+        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
        # concat above three features
        concat_mfcc_feat = np.concatenate(
            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))

--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import os
+import codecs
 class TextFeaturizer(object):
@@ -59,7 +60,7 @@ class TextFeaturizer(object):
    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
-        with open(vocab_filepath, 'r') as file:
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(

--- a/data_utils/normalizer.py
+++ b/data_utils/normalizer.py
@@ -5,7 +5,7 @@ from __future__ import print_function
 import numpy as np
 import random
-import data_utils.utils as utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
@@ -75,7 +75,7 @@ class FeatureNormalizer(object):
    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
        """Compute mean and std from randomly sampled instances."""
-        manifest = utils.read_manifest(manifest_path)
+        manifest = read_manifest(manifest_path)
        sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in sampled_manifest:

--- a/data_utils/utils.py
+++ b/data_utils/utils.py
@@ -4,15 +4,19 @@ from __future__ import division
 from __future__ import print_function
 import json
+import codecs
+import os
+import tarfile
+from paddle.v2.dataset.common import md5file
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.
    Instances with durations outside [min_duration, max_duration] will be
    filtered out.
-    :param manifest_path: Manifest file to load and parse. 
+    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
@@ -23,7 +27,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
-    for json_line in open(manifest_path):
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
@@ -32,3 +36,28 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        os.system("wget -c " + url + " -P " + target_dir)
+        print("\nMD5 Chesksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar == True:
+        os.remove(filepath)
--- a/datasets/run_all.sh
+++ b/datasets/run_all.sh
-cd librispeech
-python librispeech.py
-if [ $? -ne 0 ]; then
-    echo "Prepare LibriSpeech failed. Terminated."
-    exit 1
-fi
-cd -
-cat librispeech/manifest.train* | shuf > manifest.train
-cat librispeech/manifest.dev-clean > manifest.dev
-cat librispeech/manifest.test-clean > manifest.test
-echo "All done."
--- a/datasets/run_noise.sh
+++ b/datasets/run_noise.sh
-cd noise 
-python chime3_background.py
-if [ $? -ne 0 ]; then
-    echo "Prepare CHiME3 background noise failed. Terminated."
-    exit 1
-fi
-cd -
-cat noise/manifest.* > manifest.noise
-echo "All done."
--- a/datasets/vocab/eng_vocab.txt
+++ b/datasets/vocab/eng_vocab.txt
-'
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
--- a/lm/__init__.py
+++ b/lm/__init__.py
--- a/decoder.py
+++ b/decoder.py
@@ -9,8 +9,9 @@ from math import log
 import multiprocessing
-def ctc_best_path_decoder(probs_seq, vocabulary):
+def ctc_greedy_decoder(probs_seq, vocabulary):
-    """Best path decoder, also called argmax decoder or greedy decoder.
+    """CTC greedy (best path) decoder.
    Path consisting of the most probable tokens are further post-processed to
    remove consecutive repetitions and all blanks.
@@ -41,14 +42,16 @@ def ctc_best_path_decoder(probs_seq, vocabulary):
 def ctc_beam_search_decoder(probs_seq,
                            beam_size,
                            vocabulary,
-                            blank_id,
                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
                            ext_scoring_func=None,
                            nproc=False):
-    """Beam search decoder for CTC-trained network. It utilizes beam search
+    """CTC Beam search decoder.
-    to approximately select top best decoding labels and returning results
-    in the descending order. The implementation is based on Prefix
+    It utilizes beam search to approximately select top best decoding
-    Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
+    labels and returning results in the descending order.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
    redesigned. Two important modifications: 1) in the iterative computation
    of probabilities, the assignment operation is changed to accumulation for
    one prefix may comes from different paths; 2) the if condition "if l^+ not
@@ -63,8 +66,6 @@ def ctc_beam_search_decoder(probs_seq,
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
@@ -84,9 +85,8 @@ def ctc_beam_search_decoder(probs_seq,
            raise ValueError("The shape of prob_seq does not match with the "
                             "shape of the vocabulary.")
-    # blank_id check
+    # blank_id assign
-    if not blank_id < len(probs_seq[0]):
+    blank_id = len(vocabulary)
-        raise ValueError("blank_id shouldn't be greater than probs dimension")
    # If the decoder called in the multiprocesses, then use the global scorer
    # instantiated in ctc_beam_search_decoder_batch().
@@ -111,7 +111,7 @@ def ctc_beam_search_decoder(probs_seq,
        prob_idx = list(enumerate(probs_seq[time_step]))
        cutoff_len = len(prob_idx)
        #If pruning is enabled
-        if cutoff_prob < 1.0:
+        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
            cutoff_len, cum_prob = 0, 0.0
            for i in xrange(len(prob_idx)):
@@ -119,6 +119,7 @@ def ctc_beam_search_decoder(probs_seq,
                cutoff_len += 1
                if cum_prob >= cutoff_prob:
                    break
+            cutoff_len = min(cutoff_len, cutoff_top_n)
            prob_idx = prob_idx[0:cutoff_len]
        for l in prefix_set_prev:
@@ -177,6 +178,8 @@ def ctc_beam_search_decoder(probs_seq,
                prob = prob * ext_scoring_func(result)
            log_prob = log(prob)
            beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
    ## output top beam_size decoding results
    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
@@ -186,9 +189,9 @@ def ctc_beam_search_decoder(probs_seq,
 def ctc_beam_search_decoder_batch(probs_split,
                                  beam_size,
                                  vocabulary,
-                                  blank_id,
                                  num_processes,
                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
                                  ext_scoring_func=None):
    """CTC beam search decoder using multiple processes.
@@ -199,8 +202,6 @@ def ctc_beam_search_decoder_batch(probs_split,
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
-    :param blank_id: ID of blank.
-    :type blank_id: int
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in pruning,
@@ -227,8 +228,8 @@ def ctc_beam_search_decoder_batch(probs_split,
    pool = multiprocessing.Pool(processes=num_processes)
    results = []
    for i, probs_list in enumerate(probs_split):
-        args = (probs_list, beam_size, vocabulary, blank_id, cutoff_prob, None,
+        args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
-                nproc)
+                None, nproc)
        results.append(pool.apply_async(ctc_beam_search_decoder, args))
    pool.close()

--- a/lm/lm_scorer.py
+++ b/lm/lm_scorer.py
@@ -8,7 +8,7 @@ import kenlm
 import numpy as np
-class LmScorer(object):
+class Scorer(object):
    """External scorer to evaluate a prefix or whole sentence in
       beam search decoding, including the score from n-gram language
       model and word count.

--- a/decoders/swig/__init__.py
+++ b/decoders/swig/__init__.py
--- a/decoders/swig/_init_paths.py
+++ b/decoders/swig/_init_paths.py
+"""Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os.path
+import sys
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+this_dir = os.path.dirname(__file__)
+# Add project path to PYTHONPATH
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
--- a/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/decoders/swig/ctc_beam_search_decoder.cpp
+#include "ctc_beam_search_decoder.h"
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <utility>
+#include "ThreadPool.h"
+#include "fst/fstlib.h"
+#include "decoder_utils.h"
+#include "path_trie.h"
+using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+/* CTC prefix beam search over one sequence of per-time-step probabilities.
+ *
+ * Parameters:
+ *     probs_seq: one probability vector per time step; every vector must have
+ *                vocabulary.size() + 1 entries (the last index is the blank).
+ *     vocabulary: the token strings, indexed by label id.
+ *     beam_size: number of prefixes kept after each time step.
+ *     cutoff_prob, cutoff_top_n: pruning parameters forwarded to
+ *                get_pruned_log_probs().
+ *     ext_scorer: optional external scorer; nullptr decodes without it.
+ * Return:
+ *     The (score, string) pairs built by get_beam_search_result().
+ */
+std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
+    const std::vector<std::vector<double>> &probs_seq,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size,
+    double cutoff_prob,
+    size_t cutoff_top_n,
+    Scorer *ext_scorer) {
+  // dimension check
+  size_t num_time_steps = probs_seq.size();
+  for (size_t i = 0; i < num_time_steps; ++i) {
+    VALID_CHECK_EQ(probs_seq[i].size(),
+                   vocabulary.size() + 1,
+                   "The shape of probs_seq does not match with "
+                   "the shape of the vocabulary");
+  }
+  // assign blank id
+  size_t blank_id = vocabulary.size();
+  // assign space id
+  auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+  int space_id = it - vocabulary.begin();
+  // if no space in vocabulary
+  if ((size_t)space_id >= vocabulary.size()) {
+    space_id = -2;
+  }
+  // Owns the per-call copy of the scorer's dictionary. PathTrie only stores
+  // a raw pointer and never frees it, so without an owner the copy leaked on
+  // every call. Declared before `root` so it is destroyed after the trie
+  // that points into it.
+  std::unique_ptr<fst::StdVectorFst> dict_holder;
+  // init prefixes' root
+  PathTrie root;
+  root.score = root.log_prob_b_prev = 0.0;
+  std::vector<PathTrie *> prefixes;
+  prefixes.push_back(&root);
+  if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
+    auto fst_dict = static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
+    dict_holder.reset(fst_dict->Copy(true));
+    fst::StdVectorFst *dict_ptr = dict_holder.get();
+    root.set_dictionary(dict_ptr);
+    auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
+    root.set_matcher(matcher);
+  }
+  // prefix search over time
+  for (size_t time_step = 0; time_step < num_time_steps; ++time_step) {
+    auto &prob = probs_seq[time_step];
+    float min_cutoff = -NUM_FLT_INF;
+    bool full_beam = false;
+    if (ext_scorer != nullptr) {
+      size_t num_prefixes = std::min(prefixes.size(), beam_size);
+      std::sort(
+          prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
+      min_cutoff = prefixes[num_prefixes - 1]->score +
+                   std::log(prob[blank_id]) - std::max(0.0, ext_scorer->beta);
+      full_beam = (num_prefixes == beam_size);
+    }
+    std::vector<std::pair<size_t, float>> log_prob_idx =
+        get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n);
+    // loop over chars
+    for (size_t index = 0; index < log_prob_idx.size(); index++) {
+      auto c = log_prob_idx[index].first;
+      auto log_prob_c = log_prob_idx[index].second;
+      for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) {
+        auto prefix = prefixes[i];
+        // with a full beam, stop at the first prefix that falls below the
+        // cutoff computed above
+        if (full_beam && log_prob_c + prefix->score < min_cutoff) {
+          break;
+        }
+        // blank
+        if (c == blank_id) {
+          prefix->log_prob_b_cur =
+              log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score);
+          continue;
+        }
+        // repeated character
+        if (c == prefix->character) {
+          prefix->log_prob_nb_cur = log_sum_exp(
+              prefix->log_prob_nb_cur, log_prob_c + prefix->log_prob_nb_prev);
+        }
+        // get new prefix
+        auto prefix_new = prefix->get_path_trie(c);
+        if (prefix_new != nullptr) {
+          float log_p = -NUM_FLT_INF;
+          if (c == prefix->character &&
+              prefix->log_prob_b_prev > -NUM_FLT_INF) {
+            log_p = log_prob_c + prefix->log_prob_b_prev;
+          } else if (c != prefix->character) {
+            log_p = log_prob_c + prefix->score;
+          }
+          // language model scoring
+          if (ext_scorer != nullptr &&
+              (c == space_id || ext_scorer->is_character_based())) {
+            PathTrie *prefix_toscore = nullptr;
+            // skip scoring the space
+            if (ext_scorer->is_character_based()) {
+              prefix_toscore = prefix_new;
+            } else {
+              prefix_toscore = prefix;
+            }
+            double score = 0.0;
+            std::vector<std::string> ngram;
+            ngram = ext_scorer->make_ngram(prefix_toscore);
+            score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
+            log_p += score;
+            log_p += ext_scorer->beta;
+          }
+          prefix_new->log_prob_nb_cur =
+              log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
+        }
+      }  // end of loop over prefix
+    }    // end of loop over vocabulary
+    prefixes.clear();
+    // update log probs
+    root.iterate_to_vec(prefixes);
+    // only preserve top beam_size prefixes
+    if (prefixes.size() >= beam_size) {
+      std::nth_element(prefixes.begin(),
+                       prefixes.begin() + beam_size,
+                       prefixes.end(),
+                       prefix_compare);
+      for (size_t i = beam_size; i < prefixes.size(); ++i) {
+        prefixes[i]->remove();
+      }
+    }
+  }  // end of loop over time
+  // compute approximate ctc score as the return score, without affecting the
+  // return order of decoding result. To delete when decoder gets stable.
+  for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
+    double approx_ctc = prefixes[i]->score;
+    if (ext_scorer != nullptr) {
+      std::vector<int> output;
+      prefixes[i]->get_path_vec(output);
+      auto prefix_length = output.size();
+      auto words = ext_scorer->split_labels(output);
+      // remove word insert
+      approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
+      // remove language model weight:
+      approx_ctc -= (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
+    }
+    prefixes[i]->approx_ctc = approx_ctc;
+  }
+  return get_beam_search_result(prefixes, vocabulary, beam_size);
+}
+/* Decode a batch of samples by running ctc_beam_search_decoder() on a
+ * thread pool of num_processes workers.
+ *
+ * Every element of probs_split is decoded with the same vocabulary,
+ * beam_size, cutoff_prob, cutoff_top_n and ext_scorer. Results are returned
+ * in the same order as the inputs.
+ */
+std::vector<std::vector<std::pair<double, std::string>>>
+ctc_beam_search_decoder_batch(
+    const std::vector<std::vector<std::vector<double>>> &probs_split,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size,
+    size_t num_processes,
+    double cutoff_prob,
+    size_t cutoff_top_n,
+    Scorer *ext_scorer) {
+  // zero workers is rejected, so the requirement is "positive"
+  VALID_CHECK_GT(num_processes, 0, "num_processes must be positive!");
+  // thread pool
+  ThreadPool pool(num_processes);
+  // number of samples
+  size_t batch_size = probs_split.size();
+  // enqueue the tasks of decoding
+  std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
+  for (size_t i = 0; i < batch_size; ++i) {
+    res.emplace_back(pool.enqueue(ctc_beam_search_decoder,
+                                  probs_split[i],
+                                  vocabulary,
+                                  beam_size,
+                                  cutoff_prob,
+                                  cutoff_top_n,
+                                  ext_scorer));
+  }
+  // get decoding results (blocks until each task has finished)
+  std::vector<std::vector<std::pair<double, std::string>>> batch_results;
+  for (size_t i = 0; i < batch_size; ++i) {
+    batch_results.emplace_back(res[i].get());
+  }
+  return batch_results;
+}
--- a/decoders/swig/ctc_beam_search_decoder.h
+++ b/decoders/swig/ctc_beam_search_decoder.h
+#ifndef CTC_BEAM_SEARCH_DECODER_H_
+#define CTC_BEAM_SEARCH_DECODER_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include "scorer.h"
+/* CTC Beam Search Decoder
+ * Parameters:
+ *     probs_seq: 2-D vector that each element is a vector of probabilities
+ *               over vocabulary of one time step.
+ *     vocabulary: A vector of vocabulary.
+ *     beam_size: The width of beam search.
+ *     cutoff_prob: Cutoff probability for pruning.
+ *     cutoff_top_n: Cutoff number for pruning.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
+ * Return:
+ *     A vector in which each element is a pair of score and decoding result,
+ *     in descending order.
+*/
+std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
+    const std::vector<std::vector<double>> &probs_seq,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size,
+    double cutoff_prob = 1.0,
+    size_t cutoff_top_n = 40,
+    Scorer *ext_scorer = nullptr);
+/* CTC Beam Search Decoder for batch data
+ * Parameters:
+ *     probs_split: 3-D vector that each element is a 2-D vector that can be
+ *                  used by ctc_beam_search_decoder().
+ *     vocabulary: A vector of vocabulary.
+ *     beam_size: The width of beam search.
+ *     num_processes: Number of threads for beam search.
+ *     cutoff_prob: Cutoff probability for pruning.
+ *     cutoff_top_n: Cutoff number for pruning.
+ *     ext_scorer: External scorer to evaluate a prefix, which consists of
+ *                 n-gram language model scoring and word insertion term.
+ *                 Default null, decoding the input sample without scorer.
+ * Return:
+ *     A 2-D vector that each element is a vector of beam search decoding
+ *     result for one audio sample.
+*/
+std::vector<std::vector<std::pair<double, std::string>>>
+ctc_beam_search_decoder_batch(
+    const std::vector<std::vector<std::vector<double>>> &probs_split,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size,
+    size_t num_processes,
+    double cutoff_prob = 1.0,
+    size_t cutoff_top_n = 40,
+    Scorer *ext_scorer = nullptr);
+#endif  // CTC_BEAM_SEARCH_DECODER_H_
--- a/decoders/swig/ctc_greedy_decoder.cpp
+++ b/decoders/swig/ctc_greedy_decoder.cpp
+#include "ctc_greedy_decoder.h"
+#include "decoder_utils.h"
+/* Greedy (best path) CTC decoding: take the most probable label at every
+ * time step, collapse consecutive repetitions, then drop the blanks.
+ */
+std::string ctc_greedy_decoder(
+    const std::vector<std::vector<double>> &probs_seq,
+    const std::vector<std::string> &vocabulary) {
+  // every time step must hold one probability per token plus the blank
+  const size_t num_time_steps = probs_seq.size();
+  for (size_t i = 0; i < num_time_steps; ++i) {
+    VALID_CHECK_EQ(probs_seq[i].size(),
+                   vocabulary.size() + 1,
+                   "The shape of probs_seq does not match with "
+                   "the shape of the vocabulary");
+  }
+  // the blank is the label right after the last vocabulary entry
+  const size_t blank_id = vocabulary.size();
+  std::string best_path_result;
+  size_t previous_label = 0;
+  for (size_t step = 0; step < num_time_steps; ++step) {
+    const std::vector<double> &probs_step = probs_seq[step];
+    // first label whose probability is strictly the largest (and above zero)
+    size_t best_label = 0;
+    double best_prob = 0.0;
+    for (size_t label = 0; label < probs_step.size(); ++label) {
+      if (probs_step[label] > best_prob) {
+        best_prob = probs_step[label];
+        best_label = label;
+      }
+    }
+    // a label only counts when it differs from the one of the previous step
+    const bool is_new_label = (step == 0) || (best_label != previous_label);
+    if (is_new_label && best_label != blank_id) {
+      best_path_result += vocabulary[best_label];
+    }
+    previous_label = best_label;
+  }
+  return best_path_result;
+}
--- a/decoders/swig/ctc_greedy_decoder.h
+++ b/decoders/swig/ctc_greedy_decoder.h
+#ifndef CTC_GREEDY_DECODER_H
+#define CTC_GREEDY_DECODER_H
+#include <string>
+#include <vector>
+/* CTC Greedy (Best Path) Decoder
+ *
+ * Parameters:
+ *     probs_seq: 2-D vector that each element is a vector of probabilities
+ *               over vocabulary of one time step.
+ *     vocabulary: A vector of vocabulary.
+ * Return:
+ *     The decoding result in string
+ */
+std::string ctc_greedy_decoder(
+    const std::vector<std::vector<double>>& probs_seq,
+    const std::vector<std::string>& vocabulary);
+#endif  // CTC_GREEDY_DECODER_H
--- a/decoders/swig/decoder_utils.cpp
+++ b/decoders/swig/decoder_utils.cpp
+#include "decoder_utils.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+/* Prune the probabilities of one time step and convert them to log scale.
+ *
+ * The labels are sorted by descending probability when pruning is active and
+ * only the leading ones are kept: as many as needed for their cumulative
+ * probability to reach cutoff_prob (when cutoff_prob < 1.0), and never more
+ * than cutoff_top_n. Without pruning, the original label order is kept.
+ *
+ * Return:
+ *     (label index, log(probability + NUM_FLT_MIN)) pairs of the kept labels.
+ */
+std::vector<std::pair<size_t, float>> get_pruned_log_probs(
+    const std::vector<double> &prob_step,
+    double cutoff_prob,
+    size_t cutoff_top_n) {
+  std::vector<std::pair<int, double>> prob_idx;
+  prob_idx.reserve(prob_step.size());
+  for (size_t i = 0; i < prob_step.size(); ++i) {
+    prob_idx.push_back(std::pair<int, double>(i, prob_step[i]));
+  }
+  // pruning of vocabulary
+  size_t cutoff_len = prob_step.size();
+  if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) {
+    std::sort(
+        prob_idx.begin(), prob_idx.end(), pair_comp_second_rev<int, double>);
+    if (cutoff_prob < 1.0) {
+      double cum_prob = 0.0;
+      cutoff_len = 0;
+      for (size_t i = 0; i < prob_idx.size(); ++i) {
+        cum_prob += prob_idx[i].second;
+        cutoff_len += 1;
+        if (cum_prob >= cutoff_prob) break;
+      }
+    }
+    // Apply the top-n cap unconditionally. Previously it was only enforced
+    // inside the cutoff_prob branch, so cutoff_top_n was silently ignored
+    // whenever cutoff_prob was 1.0.
+    cutoff_len = std::min(cutoff_len, cutoff_top_n);
+    prob_idx.resize(cutoff_len);
+  }
+  std::vector<std::pair<size_t, float>> log_prob_idx;
+  log_prob_idx.reserve(cutoff_len);
+  for (size_t i = 0; i < cutoff_len; ++i) {
+    // NUM_FLT_MIN keeps the logarithm finite for zero probabilities
+    log_prob_idx.push_back(std::pair<int, float>(
+        prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN)));
+  }
+  return log_prob_idx;
+}
+/* Turn the first beam_size prefixes into (score, string) pairs.
+ *
+ * The selected prefixes are ordered with prefix_compare; the score of each
+ * pair is the negated approx_ctc of the prefix and the string is the
+ * concatenation of the vocabulary entries along the prefix path.
+ */
+std::vector<std::pair<double, std::string>> get_beam_search_result(
+    const std::vector<PathTrie *> &prefixes,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size) {
+  // take at most beam_size prefixes, in the order they were handed in
+  const size_t num_kept = std::min(beam_size, prefixes.size());
+  std::vector<PathTrie *> candidates(prefixes.begin(),
+                                     prefixes.begin() + num_kept);
+  std::sort(candidates.begin(), candidates.end(), prefix_compare);
+  std::vector<std::pair<double, std::string>> output_vecs;
+  for (PathTrie *candidate : candidates) {
+    std::vector<int> labels;
+    candidate->get_path_vec(labels);
+    // convert index to string
+    std::string text;
+    for (int label : labels) {
+      text += vocabulary[label];
+    }
+    output_vecs.emplace_back(-candidate->approx_ctc, text);
+  }
+  return output_vecs;
+}
+/* Count the bytes of str that are not UTF-8 continuation bytes (10xxxxxx),
+ * i.e. the number of characters in a well-formed UTF-8 string.
+ */
+size_t get_utf8_str_len(const std::string &str) {
+  size_t num_chars = 0;
+  for (size_t pos = 0; pos < str.size(); ++pos) {
+    const bool is_continuation = (str[pos] & 0xc0) == 0x80;
+    if (!is_continuation) {
+      ++num_chars;
+    }
+  }
+  return num_chars;
+}
+/* Split str into pieces that each start at a byte which is not a UTF-8
+ * continuation byte. The trailing piece is always appended, so an empty
+ * input yields a single empty string.
+ */
+std::vector<std::string> split_utf8_str(const std::string &str) {
+  std::vector<std::string> pieces;
+  std::string current;
+  for (size_t pos = 0; pos < str.size(); ++pos) {
+    const char byte = str[pos];
+    const bool starts_char = (byte & 0xc0) != 0x80;
+    // a new character begins: flush what has been collected so far
+    if (starts_char && !current.empty()) {
+      pieces.push_back(current);
+      current.clear();
+    }
+    current.push_back(byte);
+  }
+  pieces.push_back(current);
+  return pieces;
+}
+/* Split s on every occurrence of delim, dropping empty pieces (so leading,
+ * trailing and consecutive delimiters produce no output).
+ */
+std::vector<std::string> split_str(const std::string &s,
+                                   const std::string &delim) {
+  std::vector<std::string> tokens;
+  const std::size_t delim_len = delim.size();
+  std::size_t cursor = 0;
+  for (std::size_t hit = s.find(delim, cursor); hit != std::string::npos;
+       hit = s.find(delim, cursor)) {
+    if (hit > cursor) {
+      tokens.push_back(s.substr(cursor, hit - cursor));
+    }
+    cursor = hit + delim_len;
+  }
+  // whatever follows the last delimiter
+  if (cursor < s.size()) {
+    tokens.push_back(s.substr(cursor));
+  }
+  return tokens;
+}
+/* Ordering for prefixes: higher score first; equal scores are ordered by
+ * ascending character.
+ */
+bool prefix_compare(const PathTrie *x, const PathTrie *y) {
+  if (x->score != y->score) {
+    return x->score > y->score;
+  }
+  // equal scores: fall back to the character (false when those match too)
+  return x->character < y->character;
+}
+/* Add one word, given as a sequence of label ids, to the dictionary FST.
+ *
+ * A chain of new states is appended from the start state, one arc per label
+ * (used as both input and output label), and the last state is marked final.
+ * The start state is created first if the FST is still empty. An empty word
+ * is ignored.
+ */
+void add_word_to_fst(const std::vector<int> &word,
+                     fst::StdVectorFst *dictionary) {
+  // Nothing to add. Without this guard `dst` below would be passed to
+  // SetFinal() uninitialized.
+  if (word.empty()) {
+    return;
+  }
+  if (dictionary->NumStates() == 0) {
+    fst::StdVectorFst::StateId start = dictionary->AddState();
+    assert(start == 0);
+    dictionary->SetStart(start);
+  }
+  fst::StdVectorFst::StateId src = dictionary->Start();
+  fst::StdVectorFst::StateId dst = src;
+  for (auto c : word) {
+    dst = dictionary->AddState();
+    dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
+    src = dst;
+  }
+  dictionary->SetFinal(dst, fst::StdArc::Weight::One());
+}
+/* Map a word to label ids and add it to the dictionary FST.
+ *
+ * Each UTF-8 character of word is looked up in char_map; a " " is mapped to
+ * SPACE_ID. When add_space is set, SPACE_ID is appended after the word.
+ * Returns false, without touching the dictionary, as soon as a character is
+ * not found in char_map; returns true after the word has been added.
+ */
+bool add_word_to_dictionary(
+    const std::string &word,
+    const std::unordered_map<std::string, int> &char_map,
+    bool add_space,
+    int SPACE_ID,
+    fst::StdVectorFst *dictionary) {
+  std::vector<int> int_word;
+  for (const std::string &symbol : split_utf8_str(word)) {
+    if (symbol == " ") {
+      int_word.push_back(SPACE_ID);
+      continue;
+    }
+    const auto entry = char_map.find(symbol);
+    if (entry == char_map.end()) {
+      return false;  // unknown character: give up without adding
+    }
+    int_word.push_back(entry->second);
+  }
+  if (add_space) {
+    int_word.push_back(SPACE_ID);
+  }
+  add_word_to_fst(int_word, dictionary);
+  return true;  // the word has been added
+}
--- a/decoders/swig/decoder_utils.h
+++ b/decoders/swig/decoder_utils.h
+#ifndef DECODER_UTILS_H_
+#define DECODER_UTILS_H_
+#include <utility>
+#include "fst/log.h"
+#include "path_trie.h"
+const float NUM_FLT_INF = std::numeric_limits<float>::max();
+const float NUM_FLT_MIN = std::numeric_limits<float>::min();
+// inline function for validation check
+inline void check(
+    bool x, const char *expr, const char *file, int line, const char *err) {
+  // nothing to report when the condition holds
+  if (x) {
+    return;
+  }
+  // print the location first, then emit the failure through LOG(FATAL)
+  std::cout << "[" << file << ":" << line << "] ";
+  LOG(FATAL) << "\"" << expr << "\" check failed. " << err;
+}
+#define VALID_CHECK(x, info) \
+  check(static_cast<bool>(x), #x, __FILE__, __LINE__, info)
+#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info)
+#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info)
+#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info)
+// Function template for comparing two pairs
+template <typename T1, typename T2>
+bool pair_comp_first_rev(const std::pair<T1, T2> &a,
+                         const std::pair<T1, T2> &b) {
+  // "greater than" on the first members, i.e. a descending sort order
+  const T1 &lhs = a.first;
+  const T1 &rhs = b.first;
+  return lhs > rhs;
+}
+// Function template for comparing two pairs
+template <typename T1, typename T2>
+bool pair_comp_second_rev(const std::pair<T1, T2> &a,
+                          const std::pair<T1, T2> &b) {
+  // "greater than" on the second members, i.e. a descending sort order
+  const T2 &lhs = a.second;
+  const T2 &rhs = b.second;
+  return lhs > rhs;
+}
+// Return the sum of two probabilities in log scale
+template <typename T>
+T log_sum_exp(const T &x, const T &y) {
+  // the lowest finite value of T stands for log(0): that operand adds nothing
+  static T num_min = -std::numeric_limits<T>::max();
+  if (x <= num_min) {
+    return y;
+  }
+  if (y <= num_min) {
+    return x;
+  }
+  // shift by the larger operand before exponentiating
+  const T pivot = std::max(x, y);
+  return std::log(std::exp(x - pivot) + std::exp(y - pivot)) + pivot;
+}
+// Get pruned probability vector for each time step's beam search
+std::vector<std::pair<size_t, float>> get_pruned_log_probs(
+    const std::vector<double> &prob_step,
+    double cutoff_prob,
+    size_t cutoff_top_n);
+// Get beam search result from prefixes in trie tree
+std::vector<std::pair<double, std::string>> get_beam_search_result(
+    const std::vector<PathTrie *> &prefixes,
+    const std::vector<std::string> &vocabulary,
+    size_t beam_size);
+// Comparison function used to order prefixes
+bool prefix_compare(const PathTrie *x, const PathTrie *y);
+/* Get length of utf8 encoding string
+ * See: http://stackoverflow.com/a/4063229
+ */
+size_t get_utf8_str_len(const std::string &str);
+/* Split a string into a list of strings on a given string
+ * delimiter. NB: delimiters on beginning / end of string are
+ * trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"].
+ */
+std::vector<std::string> split_str(const std::string &s,
+                                   const std::string &delim);
+/* Splits string into vector of strings representing
+ * UTF-8 characters (not same as chars)
+ */
+std::vector<std::string> split_utf8_str(const std::string &str);
+// Add a word, given as label indices, to the FST dictionary
+void add_word_to_fst(const std::vector<int> &word,
+                     fst::StdVectorFst *dictionary);
+// Add a word in string to dictionary
+bool add_word_to_dictionary(
+    const std::string &word,
+    const std::unordered_map<std::string, int> &char_map,
+    bool add_space,
+    int SPACE_ID,
+    fst::StdVectorFst *dictionary);
+#endif  // DECODER_UTILS_H_
--- a/decoders/swig/decoders.i
+++ b/decoders/swig/decoders.i
+%module swig_decoders
+%{
+#include "scorer.h"
+#include "ctc_greedy_decoder.h"
+#include "ctc_beam_search_decoder.h"
+#include "decoder_utils.h"
+%}
+%include "std_vector.i"
+%include "std_pair.i"
+%include "std_string.i"
+%import "decoder_utils.h"
+namespace std {
+    %template(DoubleVector) std::vector<double>;
+    %template(IntVector) std::vector<int>;
+    %template(StringVector) std::vector<std::string>;
+    %template(VectorOfStructVector) std::vector<std::vector<double> >;
+    %template(FloatVector) std::vector<float>;
+    %template(Pair) std::pair<float, std::string>;
+    %template(PairFloatStringVector)  std::vector<std::pair<float, std::string> >;
+    %template(PairDoubleStringVector) std::vector<std::pair<double, std::string> >;
+    %template(PairDoubleStringVector2) std::vector<std::vector<std::pair<double, std::string> > >;
+    %template(DoubleVector3) std::vector<std::vector<std::vector<double> > >;
+}
+%template(IntDoublePairCompSecondRev) pair_comp_second_rev<int, double>;
+%template(StringDoublePairCompSecondRev) pair_comp_second_rev<std::string, double>;
+%template(DoubleStringPairCompFirstRev) pair_comp_first_rev<double, std::string>;
+%include "scorer.h"
+%include "ctc_greedy_decoder.h"
+%include "ctc_beam_search_decoder.h"
--- a/decoders/swig/path_trie.cpp
+++ b/decoders/swig/path_trie.cpp
+#include "path_trie.h"
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "decoder_utils.h"
+// Build a detached root node: no parent, no dictionary, lowest scores.
+PathTrie::PathTrie() {
+  // the root marker has to be set before `character` copies it
+  ROOT_ = -1;
+  character = ROOT_;
+  parent = nullptr;
+  exists_ = true;
+  // every probability starts at the lowest value, -NUM_FLT_INF
+  score = -NUM_FLT_INF;
+  log_prob_b_prev = -NUM_FLT_INF;
+  log_prob_nb_prev = -NUM_FLT_INF;
+  log_prob_b_cur = -NUM_FLT_INF;
+  log_prob_nb_cur = -NUM_FLT_INF;
+  // no dictionary is attached until set_dictionary() is called
+  has_dictionary_ = false;
+  dictionary_ = nullptr;
+  dictionary_state_ = 0;
+  matcher_ = nullptr;
+}
+// Free every child node; each child frees its own children in turn.
+PathTrie::~PathTrie() {
+  for (auto it = children_.begin(); it != children_.end(); ++it) {
+    delete it->second;
+  }
+}
+// Return the child reached by appending new_char, creating it if needed.
+// A child that exists but was marked removed is revived with its
+// probabilities reset. With a dictionary attached, nullptr is returned when
+// the matcher finds no arc for new_char from the current dictionary state.
+PathTrie* PathTrie::get_path_trie(int new_char, bool reset) {
+  // look for a child that already carries this character
+  auto child = children_.begin();
+  while (child != children_.end() && child->first != new_char) {
+    ++child;
+  }
+  if (child != children_.end()) {
+    PathTrie* existing = child->second;
+    if (!existing->exists_) {
+      // revive a node that was marked as removed
+      existing->exists_ = true;
+      existing->log_prob_b_prev = -NUM_FLT_INF;
+      existing->log_prob_nb_prev = -NUM_FLT_INF;
+      existing->log_prob_b_cur = -NUM_FLT_INF;
+      existing->log_prob_nb_cur = -NUM_FLT_INF;
+    }
+    return existing;
+  }
+  // no dictionary: any character may extend the path
+  if (!has_dictionary_) {
+    PathTrie* new_path = new PathTrie;
+    new_path->character = new_char;
+    new_path->parent = this;
+    children_.push_back(std::make_pair(new_char, new_path));
+    return new_path;
+  }
+  // with a dictionary: only follow arcs that exist from the current state
+  matcher_->SetState(dictionary_state_);
+  if (!matcher_->Find(new_char)) {
+    // Adding this character causes word outside dictionary
+    auto FSTZERO = fst::TropicalWeight::Zero();
+    auto final_weight = dictionary_->Final(dictionary_state_);
+    const bool is_final = (final_weight != FSTZERO);
+    if (is_final && reset) {
+      dictionary_state_ = dictionary_->Start();
+    }
+    return nullptr;
+  }
+  PathTrie* new_path = new PathTrie;
+  new_path->character = new_char;
+  new_path->parent = this;
+  new_path->dictionary_ = dictionary_;
+  new_path->dictionary_state_ = matcher_->Value().nextstate;
+  new_path->has_dictionary_ = true;
+  new_path->matcher_ = matcher_;
+  children_.push_back(std::make_pair(new_char, new_path));
+  return new_path;
+}
+// Collects the labels on the path from the root down to this node.
+PathTrie* PathTrie::get_path_vec(std::vector<int>& output) {
+  return get_path_vec(output, ROOT_);
+}
+// Walks from this node towards the root, appending each node's label to
+// `output`, until it reaches a node labelled `stop`, the root, or until
+// `output` holds `max_steps` entries. `output` is then reversed and the node
+// where the walk stopped is returned.
+PathTrie* PathTrie::get_path_vec(std::vector<int>& output,
+                                 int stop,
+                                 size_t max_steps) {
+  PathTrie* node = this;
+  while (node->character != stop && node->character != node->ROOT_ &&
+         output.size() != max_steps) {
+    output.push_back(node->character);
+    node = node->parent;
+  }
+  std::reverse(output.begin(), output.end());
+  return node;
+}
+// Pre-order traversal of the subtree rooted at this node. Every node that
+// still exists moves its current log probs into the "prev" slots, resets the
+// current ones to -NUM_FLT_INF, refreshes `score` and is appended to `output`.
+void PathTrie::iterate_to_vec(std::vector<PathTrie*>& output) {
+  if (exists_) {
+    log_prob_b_prev = log_prob_b_cur;
+    log_prob_nb_prev = log_prob_nb_cur;
+    score = log_sum_exp(log_prob_b_prev, log_prob_nb_prev);
+    log_prob_b_cur = -NUM_FLT_INF;
+    log_prob_nb_cur = -NUM_FLT_INF;
+    output.push_back(this);
+  }
+  for (auto it = children_.begin(); it != children_.end(); ++it) {
+    it->second->iterate_to_vec(output);
+  }
+}
+void PathTrie::remove() {  // Marks this node as removed and frees it when it is a leaf.
+  exists_ = false;
+  if (children_.size() == 0) {  // a node that still has children stays in the trie, only flagged
+    auto child = parent->children_.begin();  // NOTE(review): parent is dereferenced unchecked; presumably never called on a childless root - confirm
+    for (child = parent->children_.begin(); child != parent->children_.end();
+         ++child) {
+      if (child->first == character) {
+        parent->children_.erase(child);  // unlink this node from its parent's child list
+        break;
+      }
+    }
+    if (parent->children_.size() == 0 && !parent->exists_) {
+      parent->remove();  // parent became a removed leaf as well, so prune it too
+    }
+    delete this;  // must stay the last statement: no member may be touched afterwards
+  }
+}
+// Attaches an FST dictionary to this node and positions it at the start state.
+void PathTrie::set_dictionary(fst::StdVectorFst* dictionary) {
+  has_dictionary_ = true;
+  dictionary_ = dictionary;
+  dictionary_state_ = dictionary_->Start();
+}
+// Shorthand for the matcher type used to look up arcs in the dictionary FST.
+using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+// Stores the shared matcher that get_path_trie() uses for dictionary lookups.
+void PathTrie::set_matcher(std::shared_ptr<FSTMATCH> matcher) {
+  matcher_ = std::move(matcher);
+}
--- a/decoders/swig/path_trie.h
+++ b/decoders/swig/path_trie.h
+#ifndef PATH_TRIE_H
+#define PATH_TRIE_H
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "fst/fstlib.h"
+/* Trie tree for storing and manipulating prefixes, with an optional
+ * dictionary held as a finite-state transducer for spelling correction.
+ */
+class PathTrie {
+public:
+  PathTrie();
+  ~PathTrie();  // deletes all child nodes
+  // get new prefix after appending new char; nullptr if the dictionary has no arc for it
+  PathTrie* get_path_trie(int new_char, bool reset = true);
+  // get the prefix in index from root to current node
+  PathTrie* get_path_vec(std::vector<int>& output);
+  // get the prefix in index from some stop node to current node
+  PathTrie* get_path_vec(std::vector<int>& output,
+                         int stop,
+                         size_t max_steps = std::numeric_limits<size_t>::max());
+  // update log probs and collect every existing node of this subtree
+  void iterate_to_vec(std::vector<PathTrie*>& output);
+  // set dictionary for FST
+  void set_dictionary(fst::StdVectorFst* dictionary);
+  void set_matcher(std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>>);  // matcher used for dictionary lookups
+  bool is_empty() { return ROOT_ == character; }  // true for a node carrying the root label
+  // remove current path from root
+  void remove();
+  float log_prob_b_prev;  // presumably "b" = path ending in blank, "nb" = non-blank - confirm against the decoder
+  float log_prob_nb_prev;
+  float log_prob_b_cur;
+  float log_prob_nb_cur;
+  float score;  // log_sum_exp of the two *_prev values, refreshed by iterate_to_vec()
+  float approx_ctc;
+  int character;  // label of this node; equals ROOT_ for the root
+  PathTrie* parent;  // nullptr until a parent links this node in get_path_trie()
+private:
+  int ROOT_;  // label value reserved for the root node
+  bool exists_;  // false once remove() has been called on a node that is kept for its children
+  bool has_dictionary_;
+  std::vector<std::pair<int, PathTrie*>> children_;  // (label, child) pairs, searched linearly
+  // pointer to dictionary of FST
+  fst::StdVectorFst* dictionary_;
+  fst::StdVectorFst::StateId dictionary_state_;  // dictionary state reached by this prefix
+  // matcher used to find arcs in the FST
+  std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>> matcher_;
+};
+#endif  // PATH_TRIE_H
--- a/decoders/swig/scorer.cpp
+++ b/decoders/swig/scorer.cpp
+#include "scorer.h"
+#include <unistd.h>
+#include <iostream>
+#include "lm/config.hh"
+#include "lm/model.hh"
+#include "lm/state.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "decoder_utils.h"
+using namespace lm::ngram;
+// Stores the two weights, puts every other member into its default state and
+// then runs setup() to load the language model and the vocabulary.
+Scorer::Scorer(double alpha,
+               double beta,
+               const std::string& lm_path,
+               const std::vector<std::string>& vocab_list)
+    : alpha(alpha),
+      beta(beta),
+      dictionary(nullptr),
+      language_model_(nullptr),
+      is_character_based_(true),
+      max_order_(0),
+      dict_size_(0),
+      SPACE_ID_(-1) {
+  setup(lm_path, vocab_list);
+}
+// Frees the language model and the FST dictionary owned by the scorer.
+Scorer::~Scorer() {
+  // deleting a null pointer is a no-op, so no explicit null checks are needed
+  delete static_cast<lm::base::Model*>(language_model_);
+  delete static_cast<fst::StdVectorFst*>(dictionary);
+}
+// Loads the language model, records the character map and, for word-based
+// language models only, builds the FST dictionary.
+void Scorer::setup(const std::string& lm_path,
+                   const std::vector<std::string>& vocab_list) {
+  load_lm(lm_path);          // also decides whether the model is character based
+  set_char_map(vocab_list);  // must precede fill_dictionary(), which reads char_list_
+  if (is_character_based_) {
+    return;
+  }
+  fill_dictionary(true);
+}
+// Loads the language model at `lm_path`, records its order and vocabulary,
+// and clears is_character_based_ as soon as a vocabulary entry other than the
+// special tokens is longer than one UTF-8 character.
+void Scorer::load_lm(const std::string& lm_path) {
+  const char* filename = lm_path.c_str();
+  VALID_CHECK_EQ(access(filename, F_OK), 0, "Invalid language model path");
+  RetriveStrEnumerateVocab vocab_collector;
+  lm::ngram::Config lm_config;
+  lm_config.enumerate_vocab = &vocab_collector;
+  language_model_ = lm::ngram::LoadVirtual(filename, lm_config);
+  lm::base::Model* model = static_cast<lm::base::Model*>(language_model_);
+  max_order_ = model->Order();
+  vocabulary_ = vocab_collector.vocabulary;
+  for (const std::string& entry : vocabulary_) {
+    if (!is_character_based_) {
+      break;  // already known to be word based, nothing left to decide
+    }
+    const bool is_special =
+        entry == UNK_TOKEN || entry == START_TOKEN || entry == END_TOKEN;
+    if (!is_special && get_utf8_str_len(entry) > 1) {
+      is_character_based_ = false;
+    }
+  }
+}
+// Scores `words` as one n-gram and returns the log10 probability of the last
+// word given the words before it. Returns OOV_SCORE as soon as a word maps to
+// vocabulary index 0 (out of vocabulary), and also when `words` is empty.
+double Scorer::get_log_cond_prob(const std::vector<std::string>& words) {
+  lm::base::Model* model = static_cast<lm::base::Model*>(language_model_);
+  // Initialised so that an empty `words` returns a defined value instead of
+  // reading an uninitialised double.
+  double cond_prob = OOV_SCORE;
+  lm::ngram::State state, tmp_state, out_state;
+  // start from the null context to avoid inserting <s> at the beginning
+  model->NullContextWrite(&state);
+  for (size_t i = 0; i < words.size(); ++i) {
+    lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]);
+    // encounter OOV
+    if (word_index == 0) {
+      return OOV_SCORE;
+    }
+    cond_prob = model->BaseScore(&state, word_index, &out_state);
+    // swap the two states so the next word is scored from the new context
+    tmp_state = state;
+    state = out_state;
+    out_state = tmp_state;
+  }
+  // return  log10 prob
+  return cond_prob;
+}
+// Scores a whole sentence: the words are padded in front with START_TOKEN
+// (max_order_ copies for an empty sentence, max_order_ - 1 otherwise),
+// terminated with END_TOKEN and handed to get_log_prob().
+double Scorer::get_sent_log_prob(const std::vector<std::string>& words) {
+  const size_t num_start_tokens = words.empty() ? max_order_ : max_order_ - 1;
+  std::vector<std::string> sentence(num_start_tokens, START_TOKEN);
+  sentence.insert(sentence.end(), words.begin(), words.end());
+  sentence.push_back(END_TOKEN);
+  return get_log_prob(sentence);
+}
+// Sums get_log_cond_prob() over every window of max_order_ consecutive words.
+double Scorer::get_log_prob(const std::vector<std::string>& words) {
+  assert(words.size() > max_order_);
+  const size_t num_windows = words.size() - max_order_ + 1;
+  double total = 0.0;
+  for (size_t start = 0; start < num_windows; ++start) {
+    const auto window_begin = words.begin() + start;
+    const std::vector<std::string> window(window_begin,
+                                          window_begin + max_order_);
+    total += get_log_cond_prob(window);
+  }
+  return total;
+}
+// Overwrites both scoring weights; the float arguments are widened to the
+// double members.
+void Scorer::reset_params(float alpha, float beta) {
+  this->beta = beta;
+  this->alpha = alpha;
+}
+// Concatenates the char_list_ entries selected by the indices in `input`.
+std::string Scorer::vec2str(const std::vector<int>& input) {
+  std::string joined;
+  for (size_t pos = 0; pos < input.size(); ++pos) {
+    joined.append(char_list_[input[pos]]);
+  }
+  return joined;
+}
+// Turns label indices into tokens: UTF-8 characters for a character-based
+// language model, space-separated words otherwise. Empty input gives an
+// empty result.
+std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
+  if (labels.empty()) {
+    return std::vector<std::string>();
+  }
+  const std::string text = vec2str(labels);
+  if (is_character_based_) {
+    return split_utf8_str(text);
+  }
+  return split_str(text, " ");
+}
+// Stores the vocabulary and rebuilds char_map_ from its single-byte entries;
+// the index of the " " entry is additionally remembered in SPACE_ID_.
+void Scorer::set_char_map(const std::vector<std::string>& char_list) {
+  char_list_ = char_list;
+  char_map_.clear();
+  for (size_t idx = 0; idx < char_list_.size(); ++idx) {
+    const std::string& token = char_list_[idx];
+    if (token.size() != 1) {
+      continue;  // multi-byte entries are not representable as a char key
+    }
+    if (token[0] == ' ') {
+      SPACE_ID_ = idx;
+    }
+    char_map_[token[0]] = idx;
+  }
+}
+std::vector<std::string> Scorer::make_ngram(PathTrie* prefix) {  // Builds the n-gram (max_order_ tokens, oldest first) that ends at `prefix`.
+  std::vector<std::string> ngram;
+  PathTrie* current_node = prefix;
+  PathTrie* new_node = nullptr;
+  for (int order = 0; order < max_order_; order++) {
+    std::vector<int> prefix_vec;
+    if (is_character_based_) {
+      new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_, 1);  // collect at most one label per token
+      current_node = new_node;
+    } else {
+      new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_);  // walk back to the previous space or to the root
+      current_node = new_node->parent;  // Skipping spaces
+    }
+    // reconstruct word
+    std::string word = vec2str(prefix_vec);
+    ngram.push_back(word);
+    if (new_node->character == -1) {  // -1 is the root label: the start of the prefix has been reached
+      // No more spaces, but still need order
+      for (int i = 0; i < max_order_ - order - 1; i++) {
+        ngram.push_back(START_TOKEN);  // pad the missing history with start tokens
+      }
+      break;
+    }
+  }
+  std::reverse(ngram.begin(), ngram.end());  // tokens were collected newest first
+  return ngram;
+}
+void Scorer::fill_dictionary(bool add_space) {  // Builds the FST dictionary from vocabulary_ and stores it in this->dictionary.
+  fst::StdVectorFst dictionary;  // local working FST; shadows the member of the same name
+  // First reverse char_list so ints can be accessed by chars
+  std::unordered_map<std::string, int> char_map;
+  for (size_t i = 0; i < char_list_.size(); i++) {
+    char_map[char_list_[i]] = i;
+  }
+  // For each unigram convert to ints and put in trie
+  int dict_size = 0;
+  for (const auto& word : vocabulary_) {
+    bool added = add_word_to_dictionary(
+        word, char_map, add_space, SPACE_ID_, &dictionary);
+    dict_size += added ? 1 : 0;  // count only the words that were actually inserted
+  }
+  dict_size_ = dict_size;
+  /* Simplify FST
+   * This gets rid of "epsilon" transitions in the FST.
+   * These are transitions that don't require a string input to be taken.
+   * Getting rid of them is necessary to make the FST deterministic, but
+   * can greatly increase the size of the FST
+   */
+  fst::RmEpsilon(&dictionary);
+  fst::StdVectorFst* new_dict = new fst::StdVectorFst;  // heap-allocated; deleted in ~Scorer() via this->dictionary
+  /* This makes the FST deterministic, meaning for any string input there's
+   * only one possible state the FST could be in.  It is assumed our
+   * dictionary is deterministic when using it.
+   * (lest we'd have to check for multiple transitions at each state)
+   */
+  fst::Determinize(dictionary, new_dict);
+  /* Finds the simplest equivalent fst. This is unnecessary but decreases
+   * memory usage of the dictionary
+   */
+  fst::Minimize(new_dict);
+  this->dictionary = new_dict;  // NOTE(review): a previously stored dictionary is not freed here - fine if called once, confirm
+}
--- a/decoders/swig/scorer.h
+++ b/decoders/swig/scorer.h
+#ifndef SCORER_H_
+#define SCORER_H_
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "lm/enumerate_vocab.hh"
+#include "lm/virtual_interface.hh"
+#include "lm/word_index.hh"
+#include "util/string_piece.hh"
+#include "path_trie.h"
+const double OOV_SCORE = -1000.0;
+const std::string START_TOKEN = "<s>";
+const std::string UNK_TOKEN = "<unk>";
+const std::string END_TOKEN = "</s>";
+// Implement a callback to retrieve the vocabulary of the language model.
+class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
+public:
+  RetriveStrEnumerateVocab() {}
+  void Add(lm::WordIndex index, const StringPiece &str) {  // appends a copy of `str`; `index` is not used
+    vocabulary.push_back(std::string(str.data(), str.length()));
+  }
+  std::vector<std::string> vocabulary;  // entries in the order Add() received them
+};
+/* External scorer to query score for n-gram or sentence, including language
+ * model scoring and word insertion.
+ *
+ * Example:
+ *     Scorer scorer(alpha, beta, "path_of_language_model", vocabulary);
+ *     scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
+ *     scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
+ */
+class Scorer {
+public:
+  Scorer(double alpha,
+         double beta,
+         const std::string &lm_path,
+         const std::vector<std::string> &vocabulary);
+  ~Scorer();
+  double get_log_cond_prob(const std::vector<std::string> &words);  // log10 prob of the last word given the previous ones
+  double get_sent_log_prob(const std::vector<std::string> &words);  // score of the sentence padded with <s> and </s>
+  // return the max order
+  size_t get_max_order() const { return max_order_; }
+  // return the dictionary size of language model
+  size_t get_dict_size() const { return dict_size_; }
+  // return true if the language model is character based
+  bool is_character_based() const { return is_character_based_; }
+  // reset params alpha & beta
+  void reset_params(float alpha, float beta);
+  // make ngram for a given prefix
+  std::vector<std::string> make_ngram(PathTrie *prefix);
+  // transform the labels in index to the vector of words (word based lm) or
+  // the vector of characters (character based lm)
+  std::vector<std::string> split_labels(const std::vector<int> &labels);
+  // language model weight
+  double alpha;
+  // word insertion weight
+  double beta;
+  // pointer to the dictionary of FST
+  void *dictionary;
+protected:
+  // necessary setup: load language model, set char map, fill FST's dictionary
+  void setup(const std::string &lm_path,
+             const std::vector<std::string> &vocab_list);
+  // load language model from given path
+  void load_lm(const std::string &lm_path);
+  // fill dictionary for FST
+  void fill_dictionary(bool add_space);
+  // set char map
+  void set_char_map(const std::vector<std::string> &char_list);
+  double get_log_prob(const std::vector<std::string> &words);  // sum of get_log_cond_prob() over all max_order_-sized windows
+  // translate the vector in index to string
+  std::string vec2str(const std::vector<int> &input);
+private:
+  void *language_model_;  // type-erased lm::base::Model*, owned by the scorer
+  bool is_character_based_;
+  size_t max_order_;  // order of the loaded language model
+  size_t dict_size_;  // number of words added to the FST dictionary
+  int SPACE_ID_;  // index of " " in char_list_, -1 if absent
+  std::vector<std::string> char_list_;
+  std::unordered_map<char, int> char_map_;  // single-byte vocabulary entry -> index
+  std::vector<std::string> vocabulary_;  // vocabulary enumerated from the language model
+};
+#endif  // SCORER_H_
--- a/decoders/swig/setup.py
+++ b/decoders/swig/setup.py
+"""Script to build and install decoder package."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from setuptools import setup, Extension, distutils
+import glob
+import platform
+import os, sys
+import multiprocessing.pool
+import argparse
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--num_processes",
+    default=1,
+    type=int,
+    help="Number of cpu processes to build package. (default: %(default)d)")
+args = parser.parse_known_args()  # tuple: (parsed namespace, list of unrecognized arguments)
+# reconstruct sys.argv to pass to setup below
+sys.argv = [sys.argv[0]] + args[1]  # drops --num_processes so setup() only sees the remaining options
+# monkey-patch for parallel compilation
+# See: https://stackoverflow.com/a/13176803
+def parallelCCompile(self,
+                     sources,
+                     output_dir=None,
+                     macros=None,
+                     include_dirs=None,
+                     debug=0,
+                     extra_preargs=None,
+                     extra_postargs=None,
+                     depends=None):
+    # those lines are copied from distutils.ccompiler.CCompiler directly
+    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
+        output_dir, macros, include_dirs, sources, depends, extra_postargs)
+    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
+    # parallel code
+    def _single_compile(obj):
+        try:
+            src, ext = build[obj]
+        except KeyError:
+            return
+        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
+    # convert to list, imap is evaluated on-demand
+    thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes)
+    list(thread_pool.imap(_single_compile, objects))
+    return objects
+def compile_test(header, library):
+    dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
+    command = "bash -c \"g++ -include " + header \
+                + " -l" + library + " -x c++ - <<<'int main() {}' -o " \
+                + dummy_path + " >/dev/null 2>/dev/null && rm " \
+                + dummy_path + " 2>/dev/null\""
+    return os.system(command) == 0
+# hack compile to support parallel compiling
+distutils.ccompiler.CCompiler.compile = parallelCCompile
+FILES = glob.glob('kenlm/util/*.cc') \
+        + glob.glob('kenlm/lm/*.cc') \
+        + glob.glob('kenlm/util/double-conversion/*.cc')
+FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
+# FILES + glob.glob('glog/src/*.cc')
+FILES = [
+    fn for fn in FILES
+    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
+        'unittest.cc'))
+]  # keep library sources only: drop files ending in main.cc / test.cc / unittest.cc
+LIBS = ['stdc++']
+if platform.system() != 'Darwin':
+    LIBS.append('rt')  # librt is linked on every platform except macOS
+ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11']  # -DNDEBUG also disables assert() in the decoder sources
+if compile_test('zlib.h', 'z'):  # enable each optional compression backend only if it compiles and links
+    ARGS.append('-DHAVE_ZLIB')
+    LIBS.append('z')
+if compile_test('bzlib.h', 'bz2'):
+    ARGS.append('-DHAVE_BZLIB')
+    LIBS.append('bz2')
+if compile_test('lzma.h', 'lzma'):
+    ARGS.append('-DHAVE_XZLIB')
+    LIBS.append('lzma')
+os.system('swig -python -c++ ./decoders.i')  # generate the SWIG wrapper; NOTE(review): the exit status is ignored
+decoders_module = [
+    Extension(
+        name='_swig_decoders',
+        sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'),
+        language='c++',
+        include_dirs=[
+            '.',
+            'kenlm',
+            'openfst-1.6.3/src/include',
+            'ThreadPool',
+            #'glog/src'
+        ],
+        libraries=LIBS,
+        extra_compile_args=ARGS)
+]
+setup(
+    name='swig_decoders',
+    version='0.1',
+    description="""CTC decoders""",
+    ext_modules=decoders_module,
+    py_modules=['swig_decoders'], )
--- a/decoders/swig/setup.sh
+++ b/decoders/swig/setup.sh
+#!/usr/bin/env bash
+if [ ! -d kenlm ]; then  # fetch the KenLM sources only when the directory is missing
+    git clone https://github.com/luotao1/kenlm.git
+    echo -e "\n"
+fi
+if [ ! -d openfst-1.6.3 ]; then  # same for OpenFst: download and unpack the release tarball once
+    echo "Download and extract openfst ..."
+    wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz
+    tar -xzvf openfst-1.6.3.tar.gz
+    echo -e "\n"
+fi
+if [ ! -d ThreadPool ]; then  # thread pool sources; setup.py adds this directory to the include path
+    git clone https://github.com/progschj/ThreadPool.git
+    echo -e "\n"
+fi
+echo "Install decoders ..."
+python setup.py install --num_processes 4  # --num_processes is consumed by setup.py itself for parallel compilation
--- a/decoders/swig_wrapper.py
+++ b/decoders/swig_wrapper.py
+"""Wrapper for various CTC decoders in SWIG."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import swig_decoders
+class Scorer(swig_decoders.Scorer):
+    """Wrapper for Scorer.
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: basestring
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    """
+    def __init__(self, alpha, beta, model_path, vocabulary):
+        swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """Wrapper for ctc best path decoder in swig.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :return: Decoding result string.
+    :rtype: basestring
+    """
+    return swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary)
+def ctc_beam_search_decoder(probs_seq,
+                            vocabulary,
+                            beam_size,
+                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
+                            ext_scoring_func=None):
+    """Wrapper for the CTC Beam Search Decoder.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type external_scoring_func: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    return swig_decoders.ctc_beam_search_decoder(probs_seq.tolist(), vocabulary,
+                                                 beam_size, cutoff_prob,
+                                                 cutoff_top_n, ext_scoring_func)
+def ctc_beam_search_decoder_batch(probs_split,
+                                  vocabulary,
+                                  beam_size,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None):
+    """Wrapper for the batched CTC beam search decoder.
+    :param probs_seq: 3-D list with each element as an instance of 2-D list
+                      of probabilities used by ctc_beam_search_decoder().
+    :type probs_seq: 3-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in vocabulary pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type external_scoring_function: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    probs_split = [probs_seq.tolist() for probs_seq in probs_split]
+    return swig_decoders.ctc_beam_search_decoder_batch(
+        probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func)
--- a/tests/test_decoders.py
+++ b/tests/test_decoders.py
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-from decoder import *
+from decoders import decoders_deprecated as decoder
 class TestDecoders(unittest.TestCase):
@@ -49,39 +49,38 @@ class TestDecoders(unittest.TestCase):
            0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306,
            0.05294827, 0.22298418
        ]]
-        self.best_path_result = ["ac'bdc", "b'da"]
+        self.greedy_result = ["ac'bdc", "b'da"]
        self.beam_search_result = ['acdc', "b'a"]
-    def test_best_path_decoder_1(self):
+    def test_greedy_decoder_1(self):
-        bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list)
+        bst_result = decoder.ctc_greedy_decoder(self.probs_seq1,
-        self.assertEqual(bst_result, self.best_path_result[0])
+                                                self.vocab_list)
+        self.assertEqual(bst_result, self.greedy_result[0])
-    def test_best_path_decoder_2(self):
+    def test_greedy_decoder_2(self):
-        bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list)
+        bst_result = decoder.ctc_greedy_decoder(self.probs_seq2,
-        self.assertEqual(bst_result, self.best_path_result[1])
+                                                self.vocab_list)
+        self.assertEqual(bst_result, self.greedy_result[1])
    def test_beam_search_decoder_1(self):
-        beam_result = ctc_beam_search_decoder(
+        beam_result = decoder.ctc_beam_search_decoder(
            probs_seq=self.probs_seq1,
            beam_size=self.beam_size,
-            vocabulary=self.vocab_list,
+            vocabulary=self.vocab_list)
-            blank_id=len(self.vocab_list))
        self.assertEqual(beam_result[0][1], self.beam_search_result[0])
    def test_beam_search_decoder_2(self):
-        beam_result = ctc_beam_search_decoder(
+        beam_result = decoder.ctc_beam_search_decoder(
            probs_seq=self.probs_seq2,
            beam_size=self.beam_size,
-            vocabulary=self.vocab_list,
+            vocabulary=self.vocab_list)
-            blank_id=len(self.vocab_list))
        self.assertEqual(beam_result[0][1], self.beam_search_result[1])
    def test_beam_search_decoder_batch(self):
-        beam_results = ctc_beam_search_decoder_batch(
+        beam_results = decoder.ctc_beam_search_decoder_batch(
            probs_split=[self.probs_seq1, self.probs_seq2],
            beam_size=self.beam_size,
            vocabulary=self.vocab_list,
-            blank_id=len(self.vocab_list),
            num_processes=24)
        self.assertEqual(beam_results[0][0][1], self.beam_search_result[0])
        self.assertEqual(beam_results[1][0][1], self.beam_search_result[1])

--- a/demo_client.py
+++ b/demo_client.py
--- a/demo_server.py
+++ b/demo_server.py
@@ -3,111 +3,64 @@ import os
 import time
 import random
 import argparse
-import distutils.util
+import functools
 from time import gmtime, strftime
 import SocketServer
 import struct
 import wave
 import paddle.v2 as paddle
-from utils import print_arguments
+import _init_paths
 from data_utils.data import DataGenerator
-from model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
+from utils.utility import add_arguments, print_arguments
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
+add_arg = functools.partial(add_arguments, argparser=parser)
-    "--host_ip",
+# yapf: disable
-    default="localhost",
+add_arg('host_port',        int,    8086,    "Server's IP port.")
-    type=str,
+add_arg('beam_size',        int,    500,    "Beam search width.")
-    help="Server IP address. (default: %(default)s)")
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
-parser.add_argument(
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
-    "--host_port",
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-    default=8086,
+add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
-    type=int,
+add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
-    help="Server Port. (default: %(default)s)")
+add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
-parser.add_argument(
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
-    "--speech_save_dir",
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
-    default="demo_cache",
+add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
-    type=str,
+                                            "bi-directional RNNs. Not for GRU.")
-    help="Directory for saving demo speech. (default: %(default)s)")
+add_arg('host_ip',          str,
-parser.add_argument(
+        'localhost',
-    "--vocab_filepath",
+        "Server's IP address.")
-    default='datasets/vocab/eng_vocab.txt',
+add_arg('speech_save_dir',  str,
-    type=str,
+        'demo_cache',
-    help="Vocabulary filepath. (default: %(default)s)")
+        "Directory to save demo audios.")
-parser.add_argument(
+add_arg('warmup_manifest',  str,
-    "--mean_std_filepath",
+        'data/librispeech/manifest.test-clean',
-    default='mean_std.npz',
+        "Filepath of manifest to warm up.")
-    type=str,
+add_arg('mean_std_path',    str,
-    help="Manifest path for normalizer. (default: %(default)s)")
+        'data/librispeech/mean_std.npz',
-parser.add_argument(
+        "Filepath of normalizer's mean & std.")
-    "--warmup_manifest_path",
+add_arg('vocab_path',       str,
-    default='datasets/manifest.test',
+        'data/librispeech/eng_vocab.txt',
-    type=str,
+        "Filepath of vocabulary.")
-    help="Manifest path for warmup test. (default: %(default)s)")
+add_arg('model_path',       str,
-parser.add_argument(
+        './checkpoints/libri/params.latest.tar.gz',
-    "--specgram_type",
+        "If None, the training starts from scratch, "
-    default='linear',
+        "otherwise, it resumes from the pre-trained model.")
-    type=str,
+add_arg('lang_model_path',  str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
+        'lm/data/common_crawl_00.prune01111.trie.klm',
-    " or 'mfcc'. (default: %(default)s)")
+        "Filepath for language model.")
-parser.add_argument(
+add_arg('decoding_method',  str,
-    "--num_conv_layers",
+        'ctc_beam_search',
-    default=2,
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
-    type=int,
+        choices = ['ctc_beam_search', 'ctc_greedy'])
-    help="Convolution layer number. (default: %(default)s)")
+add_arg('specgram_type',    str,
-parser.add_argument(
+        'linear',
-    "--num_rnn_layers",
+        "Audio feature type. Options: linear, mfcc.",
-    default=3,
+        choices=['linear', 'mfcc'])
-    type=int,
+# yapf: enable
-    help="RNN layer number. (default: %(default)s)")
-parser.add_argument(
-    "--rnn_layer_size",
-    default=512,
-    type=int,
-    help="RNN layer cell number. (default: %(default)s)")
-parser.add_argument(
-    "--use_gpu",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use gpu or not. (default: %(default)s)")
-parser.add_argument(
-    "--model_filepath",
-    default='checkpoints/params.latest.tar.gz',
-    type=str,
-    help="Model filepath. (default: %(default)s)")
-parser.add_argument(
-    "--decode_method",
-    default='beam_search',
-    type=str,
-    help="Method for ctc decoding: best_path or beam_search. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--beam_size",
-    default=100,
-    type=int,
-    help="Width for beam search decoding. (default: %(default)d)")
-parser.add_argument(
-    "--language_model_path",
-    default="lm/data/common_crawl_00.prune01111.trie.klm",
-    type=str,
-    help="Path for language model. (default: %(default)s)")
-parser.add_argument(
-    "--alpha",
-    default=0.36,
-    type=float,
-    help="Parameter associated with language model. (default: %(default)f)")
-parser.add_argument(
-    "--beta",
-    default=0.25,
-    type=float,
-    help="Parameter associated with word count. (default: %(default)f)")
-parser.add_argument(
-    "--cutoff_prob",
-    default=0.99,
-    type=float,
-    help="The cutoff probability of pruning"
-    "in beam search. (default: %(default)f)")
 args = parser.parse_args()
@@ -147,7 +100,7 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler):
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
-        self.request.sendall(transcript)
+        self.request.sendall(transcript.encode('utf-8'))
    def _write_to_file(self, data):
        # prepare save dir and filename
@@ -188,8 +141,8 @@ def start_server():
    """Start the ASR server"""
    # prepare data generator
    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
+        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_filepath,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
@@ -199,20 +152,22 @@ def start_server():
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.model_filepath)
+        use_gru=args.use_gru,
+        pretrained_model_path=args.model_path,
+        share_rnn_weights=args.share_rnn_weights)
    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        result_transcript = ds2_model.infer_batch(
            infer_data=[feature],
-            decode_method=args.decode_method,
+            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
            num_processes=1)
        return result_transcript[0]
@@ -221,7 +176,7 @@ def start_server():
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest_path,
+        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print('-----------------------------------------------------------')

--- a/docs/images/multi_gpu_speedup.png
+++ b/docs/images/multi_gpu_speedup.png
--- a/docs/images/tuning_error_surface.png
+++ b/docs/images/tuning_error_surface.png
--- a/examples/aishell/run_data.sh
+++ b/examples/aishell/run_data.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
+--manifest_prefix='data/aishell/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/Aishell'
+if [ $? -ne 0 ]; then
+    echo "Prepare Aishell failed. Terminated."
+    exit 1
+fi
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/aishell/vocab.txt' \
+--manifest_paths='data/aishell/manifest.train'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/aishell/manifest.train' \
+--num_samples=2000 \
+--specgram_type='linear' \
+--output_path='data/aishell/mean_std.npz'
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+echo "Aishell data preparation done."
+exit 0
--- a/examples/aishell/run_train.sh
+++ b/examples/aishell/run_train.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an existing model, uncomment --init_model_path
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u train.py \
+--batch_size=64 \
+--trainer_count=8 \
+--num_passes=50 \
+--num_proc_data=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=1024 \
+--num_iter_print=100 \
+--learning_rate=5e-4 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--test_off=False \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=True \
+--share_rnn_weights=False \
+--train_manifest='data/aishell/manifest.train' \
+--dev_manifest='data/aishell/manifest.dev' \
+--mean_std_path='data/aishell/mean_std.npz' \
+--vocab_path='data/aishell/vocab.txt' \
+--output_model_dir='./checkpoints/aishell' \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_data.sh
+++ b/examples/librispeech/run_data.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/librispeech/librispeech.py \
+--manifest_prefix='data/librispeech/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
+--full_download='True'
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/librispeech/vocab.txt' \
+--manifest_paths='data/librispeech/manifest.train'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/librispeech/manifest.train' \
+--num_samples=2000 \
+--specgram_type='linear' \
+--output_path='data/librispeech/mean_std.npz'
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+echo "LibriSpeech Data preparation done."
+exit 0
--- a/examples/librispeech/run_infer.sh
+++ b/examples/librispeech/run_infer.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--cutoff_top_n=40 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_infer_golden.sh
+++ b/examples/librispeech/run_infer_golden.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--cutoff_top_n=40 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_test.sh
+++ b/examples/librispeech/run_test.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_test_golden.sh
+++ b/examples/librispeech/run_test_golden.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--cutoff_top_n=40 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_train.sh
+++ b/examples/librispeech/run_train.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an existing model, uncomment --init_model_path
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u train.py \
+--batch_size=512 \
+--trainer_count=8 \
+--num_passes=50 \
+--num_proc_data=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=5e-4 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--test_off=False \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=True \
+--share_rnn_weights=True \
+--train_manifest='data/librispeech/manifest.train' \
+--dev_manifest='data/librispeech/manifest.dev' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir='./checkpoints/libri' \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
--- a/examples/librispeech/run_tune.sh
+++ b/examples/librispeech/run_tune.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# grid-search for hyper-parameters in language model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u tools/tune.py \
+--num_samples=100 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_alphas=14 \
+--num_betas=20 \
+--alpha_from=0.1 \
+--alpha_to=0.36 \
+--beta_from=0.05 \
+--beta_to=1.0 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--tune_manifest='data/librispeech/manifest.dev-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+exit 0
--- a/examples/mandarin/run_demo_client.sh
+++ b/examples/mandarin/run_demo_client.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# start demo client
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_client.py \
+--host_ip='localhost' \
+--host_port=8086
+if [ $? -ne 0 ]; then
+    echo "Failed in starting demo client!"
+    exit 1
+fi
+exit 0
--- a/examples/mandarin/run_demo_server.sh
+++ b/examples/mandarin/run_demo_server.sh
+#! /usr/bin/env bash
+# TODO: replace the model with a mandarin model
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# start demo server
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_server.py \
+--host_ip='localhost' \
+--host_port=8086 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--speech_save_dir='demo_cache' \
+--warmup_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in starting demo server!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_data.sh
+++ b/examples/tiny/run_data.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# prepare folder
+if [ ! -e data/tiny ]; then
+    mkdir data/tiny
+fi
+# download data, generate manifests
+python data/librispeech/librispeech.py \
+--manifest_prefix='data/tiny/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/libri' \
+--full_download='False'
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+head -n 64 data/tiny/manifest.dev-clean  > data/tiny/manifest.tiny
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/tiny/vocab.txt' \
+--manifest_paths='data/tiny/manifest.dev-clean'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/tiny/manifest.tiny' \
+--num_samples=64 \
+--specgram_type='linear' \
+--output_path='data/tiny/mean_std.npz'
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+echo "Tiny data preparation done."
+exit 0
--- a/examples/tiny/run_infer.sh
+++ b/examples/tiny/run_infer.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_infer_golden.sh
+++ b/examples/tiny/run_infer_golden.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_test.sh
+++ b/examples/tiny/run_test.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=16 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_test_golden.sh
+++ b/examples/tiny/run_test_golden.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_train.sh
+++ b/examples/tiny/run_train.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an existing model, uncomment --init_model_path
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python -u train.py \
+--batch_size=16 \
+--trainer_count=4 \
+--num_passes=20 \
+--num_proc_data=1 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=1e-5 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--test_off=False \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=True \
+--share_rnn_weights=True \
+--train_manifest='data/tiny/manifest.tiny' \
+--dev_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--output_model_dir='./checkpoints/tiny' \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
--- a/examples/tiny/run_tune.sh
+++ b/examples/tiny/run_tune.sh
+#! /usr/bin/env bash
+pushd ../.. > /dev/null
+# grid-search for hyper-parameters in language model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u tools/tune.py \
+--num_samples=100 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_alphas=14 \
+--num_betas=20 \
+--alpha_from=0.1 \
+--alpha_to=0.36 \
+--beta_from=0.05 \
+--beta_to=1.0 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--tune_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-9.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+exit 0
--- a/infer.py
+++ b/infer.py
@@ -4,126 +4,73 @@ from __future__ import division
 from __future__ import print_function
 import argparse
-import distutils.util
+import functools
-import multiprocessing
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
-from error_rate import wer
+from utils.error_rate import wer, cer
-import utils
+from utils.utility import add_arguments, print_arguments
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
+add_arg = functools.partial(add_arguments, argparser=parser)
-    "--num_samples",
+# yapf: disable
-    default=10,
+add_arg('num_samples',      int,    10,     "# of samples to infer.")
-    type=int,
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
-    help="Number of samples for inference. (default: %(default)s)")
+add_arg('beam_size',        int,    500,    "Beam search width.")
-parser.add_argument(
+add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
-    "--num_conv_layers",
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
-    default=2,
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
-    type=int,
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-    help="Convolution layer number. (default: %(default)s)")
+add_arg('alpha',            float,  2.15,   "Coef of LM for beam search.")
-parser.add_argument(
+add_arg('beta',             float,  0.35,   "Coef of WC for beam search.")
-    "--num_rnn_layers",
+add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
-    default=3,
+add_arg('cutoff_top_n',     int,    40,     "Cutoff number for pruning.")
-    type=int,
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
-    help="RNN layer number. (default: %(default)s)")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
-parser.add_argument(
+add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
-    "--rnn_layer_size",
+                                            "bi-directional RNNs. Not for GRU.")
-    default=512,
+add_arg('infer_manifest',   str,
-    type=int,
+        'data/librispeech/manifest.dev-clean',
-    help="RNN layer cell number. (default: %(default)s)")
+        "Filepath of manifest to infer.")
-parser.add_argument(
+add_arg('mean_std_path',    str,
-    "--use_gpu",
+        'data/librispeech/mean_std.npz',
-    default=True,
+        "Filepath of normalizer's mean & std.")
-    type=distutils.util.strtobool,
+add_arg('vocab_path',       str,
-    help="Use gpu or not. (default: %(default)s)")
+        'data/librispeech/vocab.txt',
-parser.add_argument(
+        "Filepath of vocabulary.")
-    "--num_threads_data",
+add_arg('lang_model_path',  str,
-    default=1,
+        'models/lm/common_crawl_00.prune01111.trie.klm',
-    type=int,
+        "Filepath for language model.")
-    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
+add_arg('model_path',       str,
-parser.add_argument(
+        './checkpoints/libri/params.latest.tar.gz',
-    "--num_processes_beam_search",
+        "Filepath of the pre-trained model "
-    default=multiprocessing.cpu_count() // 2,
+        "parameters to load for inference.")
-    type=int,
+add_arg('decoding_method',  str,
-    help="Number of cpu processes for beam search. (default: %(default)s)")
+        'ctc_beam_search',
-parser.add_argument(
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
-    "--specgram_type",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
-    default='linear',
+add_arg('error_rate_type',  str,
-    type=str,
+        'wer',
-    help="Feature type of audio data: 'linear' (power spectrum)"
+        "Error rate type for evaluation.",
-    " or 'mfcc'. (default: %(default)s)")
+        choices=['wer', 'cer'])
-parser.add_argument(
+add_arg('specgram_type',    str,
-    "--trainer_count",
+        'linear',
-    default=8,
+        "Audio feature type. Options: linear, mfcc.",
-    type=int,
+        choices=['linear', 'mfcc'])
-    help="Trainer number. (default: %(default)s)")
+# yapf: enable
-parser.add_argument(
-    "--mean_std_filepath",
-    default='mean_std.npz',
-    type=str,
-    help="Manifest path for normalizer. (default: %(default)s)")
-parser.add_argument(
-    "--decode_manifest_path",
-    default='datasets/manifest.test',
-    type=str,
-    help="Manifest path for decoding. (default: %(default)s)")
-parser.add_argument(
-    "--model_filepath",
-    default='checkpoints/params.latest.tar.gz',
-    type=str,
-    help="Model filepath. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_filepath",
-    default='datasets/vocab/eng_vocab.txt',
-    type=str,
-    help="Vocabulary filepath. (default: %(default)s)")
-parser.add_argument(
-    "--decode_method",
-    default='beam_search',
-    type=str,
-    help="Method for ctc decoding: best_path or beam_search. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--beam_size",
-    default=500,
-    type=int,
-    help="Width for beam search decoding. (default: %(default)d)")
-parser.add_argument(
-    "--language_model_path",
-    default="lm/data/common_crawl_00.prune01111.trie.klm",
-    type=str,
-    help="Path for language model. (default: %(default)s)")
-parser.add_argument(
-    "--alpha",
-    default=0.36,
-    type=float,
-    help="Parameter associated with language model. (default: %(default)f)")
-parser.add_argument(
-    "--beta",
-    default=0.25,
-    type=float,
-    help="Parameter associated with word count. (default: %(default)f)")
-parser.add_argument(
-    "--cutoff_prob",
-    default=0.99,
-    type=float,
-    help="The cutoff probability of pruning"
-    "in beam search. (default: %(default)f)")
 args = parser.parse_args()
 def infer():
    """Inference for DeepSpeech2."""
    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
+        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_filepath,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        min_batch_size=1,
        sortagrad=False,
@@ -135,18 +82,26 @@ def infer():
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.model_filepath)
+        use_gru=args.use_gru,
+        pretrained_model_path=args.model_path,
+        share_rnn_weights=args.share_rnn_weights)
+    # decoders only accept string encoded in utf-8
+    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    result_transcripts = ds2_model.infer_batch(
        infer_data=infer_data,
-        decode_method=args.decode_method,
+        decoding_method=args.decoding_method,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
-        vocab_list=data_generator.vocab_list,
+        cutoff_top_n=args.cutoff_top_n,
-        language_model_path=args.language_model_path,
+        vocab_list=vocab_list,
-        num_processes=args.num_processes_beam_search)
+        language_model_path=args.lang_model_path,
+        num_processes=args.num_proc_bsearch)
+    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in infer_data
@@ -154,11 +109,13 @@ def infer():
    for target, result in zip(target_transcripts, result_transcripts):
        print("\nTarget Transcription: %s\nOutput Transcription: %s" %
              (target, result))
-        print("Current wer = %f" % wer(target, result))
+        print("Current error rate [%s] = %f" %
+              (args.error_rate_type, error_rate_func(target, result)))
+    ds2_model.logger.info("finish inference")
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    infer()

--- a/lm/run.sh
+++ b/lm/run.sh
-echo "Downloading language model ..."
-mkdir data
-LM=common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
-echo "Checking md5sum ..."
-md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
-if [ $MD5 != $md5_tmp ]; then
-    echo "Fail to download the language model!"
-    exit 1
-fi
--- a/model_utils/__init__.py
+++ b/model_utils/__init__.py
--- a/model.py
+++ b/model.py
@@ -6,11 +6,17 @@ from __future__ import print_function
 import sys
 import os
 import time
+import logging
 import gzip
-from decoder import *
+from distutils.dir_util import mkpath
-from lm.lm_scorer import LmScorer
 import paddle.v2 as paddle
-from layer import *
+from decoders.swig_wrapper import Scorer
+from decoders.swig_wrapper import ctc_greedy_decoder
+from decoders.swig_wrapper import ctc_beam_search_decoder_batch
+from model_utils.network import deep_speech_v2_network
+logging.basicConfig(
+    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
 class DeepSpeech2Model(object):
@@ -27,16 +33,23 @@ class DeepSpeech2Model(object):
    :param pretrained_model_path: Pretrained model path. If None, will train
                                  from scratch.
    :type pretrained_model_path: basestring|None
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs. Note
+                              that for GRU, weight sharing is not supported.
+    :type share_rnn_weights: bool
    """
    def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
-                 rnn_layer_size, pretrained_model_path):
+                 rnn_layer_size, use_gru, pretrained_model_path,
+                 share_rnn_weights):
        self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
-                             rnn_layer_size)
+                             rnn_layer_size, use_gru, share_rnn_weights)
        self._create_parameters(pretrained_model_path)
        self._inferer = None
        self._loss_inferer = None
        self._ext_scorer = None
+        self.logger = logging.getLogger("")
+        self.logger.setLevel(level=logging.INFO)
    def train(self,
              train_batch_reader,
@@ -46,7 +59,9 @@ class DeepSpeech2Model(object):
              gradient_clipping,
              num_passes,
              output_model_dir,
-              num_iterations_print=100):
+              is_local=True,
+              num_iterations_print=100,
+              test_off=False):
        """Train the model.
        :param train_batch_reader: Train data reader.
@@ -65,12 +80,16 @@ class DeepSpeech2Model(object):
        :param num_iterations_print: Number of training iterations for printing
                                     a training loss.
        :type rnn_iteratons_print: int
+        :param is_local: Set to False if running with pserver with multi-nodes.
+        :type is_local: bool
        :param output_model_dir: Directory for saving the model (every pass).
        :type output_model_dir: basestring
+        :param test_off: Turn off testing.
+        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(output_model_dir):
-            os.mkdir(output_model_dir)
+            mkpath(output_model_dir)
        # prepare optimizer and trainer
        optimizer = paddle.optimizer.Adam(
@@ -79,7 +98,8 @@ class DeepSpeech2Model(object):
        trainer = paddle.trainer.SGD(
            cost=self._loss,
            parameters=self._parameters,
-            update_equation=optimizer)
+            update_equation=optimizer,
+            is_local=is_local)
        # create event handler
        def event_handler(event):
@@ -103,14 +123,19 @@ class DeepSpeech2Model(object):
                start_time = time.time()
                cost_sum, cost_counter = 0.0, 0
            if isinstance(event, paddle.event.EndPass):
-                result = trainer.test(
+                if test_off:
-                    reader=dev_batch_reader, feeding=feeding_dict)
+                    print("\n------- Time: %d sec,  Pass: %d" %
+                          (time.time() - start_time, event.pass_id))
+                else:
+                    result = trainer.test(
+                        reader=dev_batch_reader, feeding=feeding_dict)
+                    print("\n------- Time: %d sec,  Pass: %d, "
+                          "ValidationCost: %s" %
+                          (time.time() - start_time, event.pass_id, 0))
                output_model_path = os.path.join(
                    output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
                with gzip.open(output_model_path, 'w') as f:
                    self._parameters.to_tar(f)
-                print("\n------- Time: %d sec,  Pass: %d, ValidationCost: %s" %
-                      (time.time() - start_time, event.pass_id, result.cost))
        # run train
        trainer.train(
@@ -137,9 +162,9 @@ class DeepSpeech2Model(object):
        # run inference
        return self._loss_inferer.infer(input=infer_data)
-    def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
+    def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
-                    beam_size, cutoff_prob, vocab_list, language_model_path,
+                    beam_size, cutoff_prob, cutoff_top_n, vocab_list,
-                    num_processes):
+                    language_model_path, num_processes):
        """Model inference. Infer the transcription for a batch of speech
        utterances.
@@ -147,9 +172,9 @@ class DeepSpeech2Model(object):
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
-        :param decode_method: Decoding method name, 'best_path' or
+        :param decoding_method: Decoding method name, 'ctc_greedy' or
-                              'beam search'.
+                                'ctc_beam_search'.
-        :param decode_method: string
+        :type decoding_method: string
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
@@ -159,6 +184,10 @@ class DeepSpeech2Model(object):
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
+        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                        characters with highest probs in vocabulary will be
+                        used in beam search, default 40.
+        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param language_model_path: Filepath for language model.
@@ -181,36 +210,47 @@ class DeepSpeech2Model(object):
        ]
        # run decoder
        results = []
-        if decode_method == "best_path":
+        if decoding_method == "ctc_greedy":
            # best path decode
            for i, probs in enumerate(probs_split):
-                output_transcription = ctc_best_path_decoder(
+                output_transcription = ctc_greedy_decoder(
-                    probs_seq=probs, vocabulary=data_generator.vocab_list)
+                    probs_seq=probs, vocabulary=vocab_list)
                results.append(output_transcription)
-        elif decode_method == "beam_search":
+        elif decoding_method == "ctc_beam_search":
            # initialize external scorer
            if self._ext_scorer == None:
-                self._ext_scorer = LmScorer(beam_alpha, beam_beta,
-                                            language_model_path)
                self._loaded_lm_path = language_model_path
+                self.logger.info("begin to initialize the external scorer "
+                                 "for decoding")
+                self._ext_scorer = Scorer(beam_alpha, beam_beta,
+                                          language_model_path, vocab_list)
+                lm_char_based = self._ext_scorer.is_character_based()
+                lm_max_order = self._ext_scorer.get_max_order()
+                lm_dict_size = self._ext_scorer.get_dict_size()
+                self.logger.info("language model: "
+                                 "is_character_based = %d," % lm_char_based +
+                                 " max_order = %d," % lm_max_order +
+                                 " dict_size = %d" % lm_dict_size)
+                self.logger.info("end initializing scorer. Start decoding ...")
            else:
                self._ext_scorer.reset_params(beam_alpha, beam_beta)
                assert self._loaded_lm_path == language_model_path
            # beam search decode
+            num_processes = min(num_processes, len(probs_split))
            beam_search_results = ctc_beam_search_decoder_batch(
                probs_split=probs_split,
                vocabulary=vocab_list,
                beam_size=beam_size,
-                blank_id=len(vocab_list),
                num_processes=num_processes,
                ext_scoring_func=self._ext_scorer,
-                cutoff_prob=cutoff_prob)
+                cutoff_prob=cutoff_prob,
+                cutoff_top_n=cutoff_top_n)
            results = [result[0][1] for result in beam_search_results]
        else:
            raise ValueError("Decoding method [%s] is not supported." %
-                             decode_method)
+                             decoding_method)
        return results
    def _create_parameters(self, model_path=None):
@@ -222,7 +262,7 @@ class DeepSpeech2Model(object):
                gzip.open(model_path))
    def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
-                        rnn_layer_size):
+                        rnn_layer_size, use_gru, share_rnn_weights):
        """Create data layers and model network."""
        # paddle.data_type.dense_array is used for variable batch input.
        # The size 161 * 161 is only an placeholder value and the real shape
@@ -233,10 +273,12 @@ class DeepSpeech2Model(object):
        text_data = paddle.layer.data(
            name="transcript_text",
            type=paddle.data_type.integer_value_sequence(vocab_size))
-        self._log_probs, self._loss = deep_speech2(
+        self._log_probs, self._loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            dict_size=vocab_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_layer_size)
+            rnn_size=rnn_layer_size,
+            use_gru=use_gru,
+            share_rnn_weights=share_rnn_weights)
--- a/layer.py
+++ b/layer.py
-"""Contains DeepSpeech2 layers."""
+"""Contains DeepSpeech2 layers and networks."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -39,7 +39,7 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
    return paddle.layer.batch_norm(input=conv_layer, act=act)
-def bidirectional_simple_rnn_bn_layer(name, input, size, act):
+def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.
@@ -51,23 +51,91 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
+    :param share_weights: Whether to share input-hidden weights between
+                          forward and backward directional RNNs.
+    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
-    # input-hidden weights shared across bi-direcitonal rnn.
+    if share_weights:
-    input_proj = paddle.layer.fc(
+        # input-hidden weights shared between the bi-directional RNNs.
-        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
+        input_proj = paddle.layer.fc(
-    # batch norm is only performed on input-state projection 
+            input=input,
-    input_proj_bn = paddle.layer.batch_norm(
+            size=size,
-        input=input_proj, act=paddle.activation.Linear())
+            act=paddle.activation.Linear(),
-    # forward and backward in time
+            bias_attr=False)
-    forward_simple_rnn = paddle.layer.recurrent(
+        # batch norm is only performed on input-state projection
-        input=input_proj_bn, act=act, reverse=False)
+        input_proj_bn = paddle.layer.batch_norm(
-    backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj, act=paddle.activation.Linear())
-        input=input_proj_bn, act=act, reverse=True)
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=True)
+    else:
+        input_proj_forward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        input_proj_backward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn_forward = paddle.layer.batch_norm(
+            input=input_proj_forward, act=paddle.activation.Linear())
+        input_proj_bn_backward = paddle.layer.batch_norm(
+            input=input_proj_backward, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_forward, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_backward, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
+def bidirectional_gru_bn_layer(name, input, size, act):
+    """Bidirectional GRU layer with sequence-wise batch normalization.
+    The batch normalization is only performed on input-state weights.
+    Each direction gets its own bias-free linear projection of ``input``
+    (width ``size * 3``), followed by batch normalization and a
+    ``paddle.layer.grumemory``; the input projections are not shared between
+    the two directions.
+    :param name: Name of the layer. NOTE: not referenced by the function
+                 body.
+    :type name: string
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells. The input projections are created with
+                 width ``size * 3`` (presumably one slice per GRU gate /
+                 candidate -- confirm against paddle.layer.grumemory).
+    :type size: int
+    :param act: Activation type passed to both GRU layers.
+    :type act: BaseActivation
+    :return: Concatenation of the forward and backward GRU layer outputs.
+    :rtype: LayerOutput
+    """
+    # separate (non-shared) input projections for each direction
+    input_proj_forward = paddle.layer.fc(
+        input=input,
+        size=size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    input_proj_backward = paddle.layer.fc(
+        input=input,
+        size=size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    # batch norm is only performed on input-related projections
+    input_proj_bn_forward = paddle.layer.batch_norm(
+        input=input_proj_forward, act=paddle.activation.Linear())
+    input_proj_bn_backward = paddle.layer.batch_norm(
+        input=input_proj_backward, act=paddle.activation.Linear())
+    # forward and backward in time
+    forward_gru = paddle.layer.grumemory(
+        input=input_proj_bn_forward, act=act, reverse=False)
+    backward_gru = paddle.layer.grumemory(
+        input=input_proj_bn_backward, act=act, reverse=True)
+    return paddle.layer.concat(input=[forward_gru, backward_gru])
 def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.
@@ -100,7 +168,7 @@ def conv_group(input, num_stacks):
    return conv, output_num_channels, output_height
-def rnn_group(input, size, num_stacks):
+def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
    """RNN group with stacked bidirectional simple RNN layers.
    :param input: Input layer.
@@ -109,24 +177,43 @@ def rnn_group(input, size, num_stacks):
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
+    :param use_gru: Use gru if set True. Use simple rnn if set False.
+    :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
-        output = bidirectional_simple_rnn_bn_layer(
+        if use_gru:
-            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
+            output = bidirectional_gru_bn_layer(
+                name=str(i),
+                input=output,
+                size=size,
+                act=paddle.activation.Relu())
+            # BRelu does not support hppl, need to add later. Use Relu instead.
+        else:
+            output = bidirectional_simple_rnn_bn_layer(
+                name=str(i),
+                input=output,
+                size=size,
+                act=paddle.activation.BRelu(),
+                share_weights=share_rnn_weights)
    return output
-def deep_speech2(audio_data,
+def deep_speech_v2_network(audio_data,
-                 text_data,
+                           text_data,
-                 dict_size,
+                           dict_size,
-                 num_conv_layers=2,
+                           num_conv_layers=2,
-                 num_rnn_layers=3,
+                           num_rnn_layers=3,
-                 rnn_size=256):
+                           rnn_size=256,
-    """
+                           use_gru=False,
-    The whole DeepSpeech2 model structure (a simplified version).
+                           share_rnn_weights=True):
+    """The DeepSpeech2 network structure.
    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
@@ -140,6 +227,12 @@ def deep_speech2(audio_data,
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
+    :param use_gru: Use gru if set True. Use simple rnn if set False.
+    :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward direction RNNs.
+                              It is only available when use_gru=False.
+    :type share_rnn_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
@@ -157,7 +250,11 @@ def deep_speech2(audio_data,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
-        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+        input=conv2seq,
+        size=rnn_size,
+        num_stacks=num_rnn_layers,
+        use_gru=use_gru,
+        share_rnn_weights=share_rnn_weights)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,

--- a/models/aishell/download_model.sh
+++ b/models/aishell/download_model.sh
+#! /usr/bin/env bash
+# Download the Aishell model archive and extract it into the current
+# directory. Exits 0 on success and 1 if the download or extraction fails.
+# NOTE: utility.sh is sourced via a path relative to the current working
+# directory, so run this script from its own directory. `download` is
+# expected to be provided by utility.sh.
+source ../../utils/utility.sh
+URL='http://cloud.dlnel.org/filepub/?uuid=6c83b9d8-3255-4adf-9726-0fe0be3d0274'
+MD5=28521a58552885a81cf92a1e9b133a71
+TARGET=./aishell_model.tar.gz
+echo "Download Aishell model ..."
+# Quote the arguments: the URL contains '?', which an unquoted expansion
+# would expose to pathname expansion (and all three to word splitting).
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download Aishell model!"
+    exit 1
+fi
+# Report extraction failures instead of unconditionally exiting 0.
+tar -zxvf "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to extract Aishell model!"
+    exit 1
+fi
+exit 0
--- a/models/librispeech/download_model.sh
+++ b/models/librispeech/download_model.sh
+#! /usr/bin/env bash
+# Download the LibriSpeech model archive and extract it into the current
+# directory. Exits 0 on success and 1 if the download or extraction fails.
+# NOTE: utility.sh is sourced via a path relative to the current working
+# directory, so run this script from its own directory. `download` is
+# expected to be provided by utility.sh.
+source ../../utils/utility.sh
+URL='http://cloud.dlnel.org/filepub/?uuid=8e3cf742-2ff3-41ce-a49d-f6158cc06a23'
+MD5=2ef08f8b608a7c555592161fc14d81a6
+TARGET=./librispeech_model.tar.gz
+echo "Download LibriSpeech model ..."
+# Quote the arguments: the URL contains '?', which an unquoted expansion
+# would expose to pathname expansion (and all three to word splitting).
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download LibriSpeech model!"
+    exit 1
+fi
+# Report extraction failures instead of unconditionally exiting 0.
+tar -zxvf "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to extract LibriSpeech model!"
+    exit 1
+fi
+exit 0
--- a/models/lm/download_lm_ch.sh
+++ b/models/lm/download_lm_ch.sh
+#! /usr/bin/env bash
+# Download the language model file zh_giga.no_cna_cmn.prune01244.klm into
+# the current directory. Exits 0 on success and 1 if the download fails.
+# NOTE: utility.sh is sourced via a path relative to the current working
+# directory, so run this script from its own directory. `download` is
+# expected to be provided by utility.sh.
+source ../../utils/utility.sh
+URL='http://cloud.dlnel.org/filepub/?uuid=d21861e4-4ed6-45bb-ad8e-ae417a43195e'
+MD5="29e02312deb2e59b3c8686c7966d4fe3"
+TARGET=./zh_giga.no_cna_cmn.prune01244.klm
+echo "Download language model ..."
+# Quote the arguments: the URL contains '?', which an unquoted expansion
+# would expose to pathname expansion (and all three to word splitting).
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download the language model!"
+    exit 1
+fi
+exit 0
--- a/models/lm/download_lm_en.sh
+++ b/models/lm/download_lm_en.sh
+#! /usr/bin/env bash
+# Download the language model file common_crawl_00.prune01111.trie.klm into
+# the current directory. Exits 0 on success and 1 if the download fails.
+# NOTE: utility.sh is sourced via a path relative to the current working
+# directory, so run this script from its own directory. `download` is
+# expected to be provided by utility.sh.
+source ../../utils/utility.sh
+URL='http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm'
+MD5="099a601759d467cd0a8523ff939819c5"
+TARGET=./common_crawl_00.prune01111.trie.klm
+echo "Download language model ..."
+# Quote the arguments so they are never subject to word splitting or
+# pathname expansion, matching the other download scripts.
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download the language model!"
+    exit 1
+fi
+exit 0
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,3 @@ scipy==0.13.1
 resampy==0.1.5
 SoundFile==0.9.0.post1
 python_speech_features
-https://github.com/luotao1/kenlm/archive/master.zip
--- a/setup.sh
+++ b/setup.sh
-#!/bin/bash
+#! /usr/bin/env  bash
 # install python dependencies
 if [ -f "requirements.txt" ]; then
@@ -13,17 +13,26 @@ fi
 python -c "import soundfile"
 if [ $? != 0 ]; then
    echo "Install package libsndfile into default system path."
-    curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
+    wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
    if [ $? != 0 ]; then
        echo "Download libsndfile-1.0.28.tar.gz failed !!!"
        exit 1
    fi
    tar -zxvf libsndfile-1.0.28.tar.gz
    cd libsndfile-1.0.28
-    ./configure && make && make install
+    ./configure > /dev/null && make > /dev/null && make install > /dev/null
    cd ..
    rm -rf libsndfile-1.0.28
    rm libsndfile-1.0.28.tar.gz
 fi
+# install decoders
+python -c "import swig_decoders"
+if [ $? != 0 ]; then
+    cd decoders/swig > /dev/null
+    sh setup.sh
+    cd - > /dev/null
+fi
 echo "Install all dependencies successfully."
--- a/evaluate.py
+++ b/evaluate.py
--- a/tests/test_setup.py
+++ b/tests/test_setup.py
-"""Test Setup."""
-import unittest
-import numpy as np
-import os
-class TestSetup(unittest.TestCase):
-    def test_soundfile(self):
-        import soundfile as sf
-        # floating point data is typically limited to the interval [-1.0, 1.0],
-        # but smaller/larger values are supported as well
-        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
-                         [0.25, -0.25]])
-        file = 'test.wav'
-        sf.write(file, data, 44100, format='WAV', subtype='FLOAT')
-        read, fs = sf.read(file)
-        self.assertTrue(np.all(read == data))
-        self.assertEqual(fs, 44100)
-        os.remove(file)
-if __name__ == '__main__':
-    unittest.main()
--- a/tools/_init_paths.py
+++ b/tools/_init_paths.py
+"""Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os.path
+import sys
+def add_path(path):
+    """Insert ``path`` at the front of ``sys.path`` unless it is already
+    present.
+    The membership test is a plain string comparison, so the same directory
+    spelled differently (e.g. relative vs. absolute) is not detected as a
+    duplicate.
+    :param path: Directory to make importable.
+    :type path: basestring
+    """
+    if path not in sys.path:
+        sys.path.insert(0, path)
+# Directory containing this file, as reported by __file__.
+this_dir = os.path.dirname(__file__)
+# Add the project path (parent of this file's directory) to sys.path.
+# The joined path is not normalized: it keeps the literal '..' component.
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
--- a/compute_mean_std.py
+++ b/compute_mean_std.py
--- a/tools/profile.sh
+++ b/tools/profile.sh
--- a/tools/tune.py
+++ b/tools/tune.py
--- a/train.py
+++ b/train.py
--- a/tune.py
+++ b/tune.py
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/error_rate.py
+++ b/error_rate.py
--- a/tests/test_error_rate.py
+++ b/tests/test_error_rate.py
--- a/utils.py
+++ b/utils.py
--- a/utils/utility.sh
+++ b/utils/utility.sh