Commit f2120bb5 authored by Yibing Liu

adapt to the new structure

manifest*
mean_std.npz
thirdparty/
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
@@ -19,8 +19,6 @@ import json
 import codecs
 from paddle.v2.dataset.common import md5file
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
 URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--target_dir",
-    default=DATA_HOME + "/Libri",
+    default='~/.cache/paddle/dataset/speech/libri',
     type=str,
     help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@@ -60,8 +58,7 @@ args = parser.parse_args()
 def download(url, md5sum, target_dir):
-    """
-    Download file from url to target_dir, and check md5sum.
+    """Download file from url to target_dir, and check md5sum.
     """
     if not os.path.exists(target_dir): os.makedirs(target_dir)
     filepath = os.path.join(target_dir, url.split("/")[-1])
@@ -77,8 +74,7 @@ def download(url, md5sum, target_dir):
 def unpack(filepath, target_dir):
-    """
-    Unpack the file to the target_dir.
+    """Unpack the file to the target_dir.
     """
     print("Unpacking %s ..." % filepath)
     tar = tarfile.open(filepath)
@@ -87,8 +83,7 @@ def unpack(filepath, target_dir):
 def create_manifest(data_dir, manifest_path):
-    """
-    Create a manifest json file summarizing the data set, with each line
+    """Create a manifest json file summarizing the data set, with each line
     containing the meta data (i.e. audio filepath, transcription text, audio
     duration) of each audio file within the data set.
     """
@@ -119,8 +114,7 @@ def create_manifest(data_dir, manifest_path):
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """
-    Download, unpack and create summmary manifest file.
+    """Download, unpack and create summmary manifest file.
     """
     if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
         # download
@@ -135,6 +129,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
+    args.target_dir = os.path.expanduser(args.target_dir)
+
     prepare_dataset(
         url=URL_TEST_CLEAN,
         md5sum=MD5_TEST_CLEAN,
@@ -145,12 +141,12 @@ def main():
         md5sum=MD5_DEV_CLEAN,
         target_dir=os.path.join(args.target_dir, "dev-clean"),
         manifest_path=args.manifest_prefix + ".dev-clean")
-    prepare_dataset(
-        url=URL_TRAIN_CLEAN_100,
-        md5sum=MD5_TRAIN_CLEAN_100,
-        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100")
     if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
         prepare_dataset(
             url=URL_TEST_OTHER,
             md5sum=MD5_TEST_OTHER,
......
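
Note: the two changes above work together. The argparse default now keeps a literal '~', which the shell does not expand when the flag is passed as a quoted string, so main() expands it explicitly before any path is joined. A minimal, self-contained sketch of the pattern:

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument(
    "--target_dir",
    default='~/.cache/paddle/dataset/speech/libri',
    type=str)
args = parser.parse_args()

# Expand '~' once, up front, so every later os.path.join sees a real path.
args.target_dir = os.path.expanduser(args.target_dir)
print(args.target_dir)
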
@@ -7,8 +7,8 @@
 #include <map>
 #include <utility>
-#include "fst/fstlib.h"
 #include "ThreadPool.h"
+#include "fst/fstlib.h"
 #include "decoder_utils.h"
 #include "path_trie.h"
......
 #ifndef PATH_TRIE_H
 #define PATH_TRIE_H
 #pragma once
-#include <fst/fstlib.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
 #include <utility>
 #include <vector>
+#include <fst/fstlib.h>
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
......
@@ -11,7 +11,7 @@ import wave
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
 from utils.utility import add_arguments, print_arguments
@@ -46,7 +46,7 @@ add_arg('vocab_path', str,
         'data/librispeech/eng_vocab.txt',
         "Filepath of vocabulary.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path', str,
......
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
 # download data, generate manifests
 python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
---full_download='True' \
---target_dir=$HOME'/.cache/paddle/dataset/speech/Libri'
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
+--full_download='True'
 if [ $? -ne 0 ]; then
     echo "Prepare LibriSpeech failed. Terminated."
     exit 1
 fi
-#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/librispeech/vocab.txt' \
+--manifest_paths='data/librispeech/manifest.train'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
 # compute mean and stddev for normalizer
@@ -30,3 +42,4 @@ fi
 echo "LibriSpeech Data preparation done."
+exit 0
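
Note: the new build-vocabulary step reads the combined training manifest produced by the cat/shuf line above. Per create_manifest in librispeech.py, each manifest line is a JSON object carrying the audio filepath, transcription text, and audio duration; the exact key names in this sketch are assumptions for illustration, not taken from the diff:

import json

# One hypothetical manifest line (key names assumed).
line = ('{"audio_filepath": "dev-clean/84/121123/84-121123-0000.flac", '
        '"duration": 6.95, "text": "go do you hear"}')
sample = json.loads(line)
print(sample["text"], sample["duration"])
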
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
 --num_samples=10 \
 --trainer_count=1 \
 --beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -18,11 +27,19 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
---infer_manifest='data/librispeech/manifest.dev-clean' \
+--infer_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u test.py \
 --batch_size=128 \
 --trainer_count=8 \
 --beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -20,9 +30,17 @@ python -u test.py \
 --share_rnn_weights=True \
 --test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an exists model, uncomment --init_model_path
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u train.py \
---batch_size=256 \
+--batch_size=512 \
 --trainer_count=8 \
---num_passes=200 \
+--num_passes=50 \
 --num_proc_data=12 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
@@ -23,8 +25,16 @@ python -u train.py \
 --train_manifest='data/librispeech/manifest.train' \
 --dev_manifest='data/librispeech/manifest.dev' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---output_model_dir='./checkpoints' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir='./checkpoints/libri' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u tools/tune.py \
 --num_samples=100 \
@@ -23,8 +24,16 @@ python -u tools/tune.py \
 --share_rnn_weights=True \
 --tune_manifest='data/librispeech/manifest.dev-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# start demo client
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_client.py \
--host_ip='localhost' \
--host_port=8086
if [ $? -ne 0 ]; then
echo "Failed in starting demo client!"
exit 1
fi
exit 0
#! /usr/bin/bash
# TODO: replace the model with a mandarin model
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# start demo server
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_server.py \
--host_ip='localhost' \
--host_port=8086 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--speech_save_dir='demo_cache' \
--warmup_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in starting demo server!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# prepare folder
if [ ! -e data/tiny ]; then
mkdir data/tiny
fi
# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir='~/.cache/paddle/dataset/speech/libri' \
--full_download='False'
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
head -n 64 data/tiny/manifest.dev-clean > data/tiny/manifest.tiny
# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.tiny'
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.tiny' \
--num_samples=64 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
echo "Tiny data preparation done."
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=16 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# train model
# if you wish to resume from an exists model, uncomment --init_model_path
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -u train.py \
--batch_size=16 \
--trainer_count=4 \
--num_passes=20 \
--num_proc_data=1 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=1e-5 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/tiny/manifest.tiny' \
--dev_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--output_model_dir='./checkpoints/tiny' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
if [ $? -ne 0 ]; then
echo "Fail to do inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# grid-search for hyper-parameters in language model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/params.pass-9.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0
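
Note: both tune scripts grid-search the language-model weight (alpha) and the word-insertion weight (beta) for the CTC beam-search decoder. Assuming the *_from/*_to flags are endpoints of an evenly spaced grid, the search space can be enumerated like this:

import numpy as np

# Assumed: linear spacing between the endpoints passed to tools/tune.py.
alphas = np.linspace(0.1, 0.36, 14)   # --alpha_from, --alpha_to, --num_alphas
betas = np.linspace(0.05, 1.0, 20)    # --beta_from, --beta_to, --num_betas
grid = [(a, b) for a in alphas for b in betas]
print(len(grid))  # 280 (alpha, beta) pairs, each decoded and scored by WER
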
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -35,13 +35,13 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('decoding_method', str,
......
echo "Downloading language model ..."
mkdir data
LM=common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
echo "Checking md5sum ..."
md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
if [ $MD5 != $md5_tmp ]; then
echo "Fail to download the language model!"
exit 1
fi
@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq,
             prob = prob * ext_scoring_func(result)
             log_prob = log(prob)
             beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
     ## output top beam_size decoding results
     beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
......
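
Note: the two added lines handle beam candidates that yield no scored result. Appending (float('-inf'), '') keeps beam_result aligned with the beam width, and the existing sort pushes the empty placeholders to the bottom. A toy illustration of why -inf is the right sentinel here:

from math import log

beam_result = [(log(0.2), 'hello'), (float('-inf'), ''), (log(0.5), 'hallo')]
# Same sort as in ctc_beam_search_decoder: best log-probability first.
beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
print(beam_result)  # the (-inf, '') entry sorts last
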
@@ -8,10 +8,10 @@ import os
 import time
 import gzip
 import paddle.v2 as paddle
-from models.swig_decoders_wrapper import Scorer
-from models.swig_decoders_wrapper import ctc_greedy_decoder
-from models.swig_decoders_wrapper import ctc_beam_search_decoder_batch
-from models.network import deep_speech_v2_network
+from decoders.swig_wrapper import Scorer
+from decoders.swig_wrapper import ctc_greedy_decoder
+from decoders.swig_wrapper import ctc_beam_search_decoder_batch
+from model_utils.network import deep_speech_v2_network
 class DeepSpeech2Model(object):
......
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-from models import decoder
+from model_utils import decoder
 class TestDecoders(unittest.TestCase):
......
#! /usr/bin/bash
source ../../utils/utility.sh
# TODO: add urls
URL='to-be-added'
MD5=5b4af224b26c1dc4dd972b7d32f2f52a
TARGET=./librispeech_model.tar.gz
echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET
exit 0
#! /usr/bin/bash
source ../../utils/utility.sh
URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=./common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -36,14 +36,14 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('decoding_method', str,
         'ctc_beam_search',
......
@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
 add_arg('vocab_path', str,
-        'datasets/vocab/zh_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath to write the vocabulary.")
 add_arg('manifest_paths', str,
         None,
@@ -34,7 +34,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
-    manifest_jsons = utils.read_manifest(manifest_path)
+    manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
         for char in line_json['text']:
             counter.update(char)
......
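
Note: count_manifest tallies per-character frequencies across all transcriptions, and characters above count_threshold become the vocabulary. A self-contained sketch of that logic (read_manifest is stubbed with literal dicts; the alphabetical ordering is purely for illustration):

from collections import Counter

manifest_jsons = [{"text": "hello"}, {"text": "world"}]  # stand-in for read_manifest()

counter = Counter()
for line_json in manifest_jsons:
    for char in line_json['text']:
        counter.update(char)

count_threshold = 0
vocab = sorted(ch for ch, cnt in counter.items() if cnt > count_threshold)
print(vocab)  # ['d', 'e', 'h', 'l', 'o', 'r', 'w']
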
@@ -20,10 +20,10 @@ add_arg('specgram_type', str,
         "Audio feature type. Options: linear, mfcc.",
         choices=['linear', 'mfcc'])
 add_arg('manifest_path', str,
-        'datasets/manifest.train',
+        'data/librispeech/manifest.train',
         "Filepath of manifest to compute normalizer's mean and stddev.")
 add_arg('output_path', str,
-        'mean_std.npz',
+        'data/librispeech/mean_std.npz',
         "Filepath of write mean and stddev to (.npz).")
 # yapf: disable
 args = parser.parse_args()
......
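
Note: the normalizer statistics written here are a per-frequency-bin mean and standard deviation over a sample of spectrograms. A minimal sketch of the idea, assuming (bins, frames) features and the array names mean/std inside the .npz (both are assumptions, not taken from the diff):

import numpy as np

# 64 fake linear spectrograms, each (161 freq bins, 300 frames).
features = [np.random.rand(161, 300) for _ in range(64)]
stacked = np.concatenate(features, axis=1)  # pool frames across samples

mean = np.mean(stacked, axis=1)
std = np.std(stacked, axis=1)
np.savez('data/librispeech/mean_std.npz', mean=mean, std=std)
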
@@ -9,7 +9,7 @@ import functools
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer
 from utils.utility import add_arguments, print_arguments
@@ -41,13 +41,13 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('error_rate_type', str,
......
@@ -6,7 +6,7 @@ from __future__ import print_function
 import argparse
 import functools
 import paddle.v2 as paddle
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.data import DataGenerator
 from utils.utility import add_arguments, print_arguments
@@ -41,14 +41,14 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('init_model_path', str,
         None,
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('output_model_dir', str,
-        "./checkpoints",
+        "./checkpoints/libri",
         "Directory for saving checkpoints.")
 add_arg('augment_conf_path',str,
         'conf/augmentation.config',
......
download() {
URL=$1
MD5=$2
TARGET=$3
if [ -e $TARGET ]; then
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
if [ $MD5 == $md5_result ]; then
echo "$TARGET already exists, download skipped."
return 0
fi
fi
wget -c $URL -P `dirname "$TARGET"`
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
# md5 sums are strings, so compare with !=; the integer test -ne fails here
if [ $MD5 != $md5_result ]; then
echo "Fail to download $TARGET!"
return 1
fi
}
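
Note: download() skips the fetch when the target already carries the expected md5 and re-verifies the checksum afterwards. For reference, the same check-then-download logic as a small Python 3 sketch (URL and paths are placeholders, not from this commit):

import hashlib
import os
import urllib.request

def md5_of(path):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

def download(url, md5sum, target):
    # Skip the fetch when an intact copy is already on disk.
    if os.path.exists(target) and md5_of(target) == md5sum:
        print("%s already exists, download skipped." % target)
        return
    urllib.request.urlretrieve(url, target)
    if md5_of(target) != md5sum:
        raise IOError("Fail to download %s!" % target)
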