Update example scripts and README.md for DS2.

f329ecda · Xinghai Sun · d28ee3fc · f329ecda · d28ee3fc · f329ecda
24 changed files
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
--- a/deep_speech_2/data/librispeech/eng_vocab.txt
+++ b/deep_speech_2/data/librispeech/eng_vocab.txt
-'
- 
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
--- a/deep_speech_2/data/librispeech/librispeech.py
+++ b/deep_speech_2/data/librispeech/librispeech.py
@@ -19,8 +19,6 @@ import json
 import codecs
 from paddle.v2.dataset.common import md5file

-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
 URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
-    default=DATA_HOME + "/libri",
+    default='~/.cache/paddle/dataset/speech/libri',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@@ -60,14 +58,14 @@ args = parser.parse_args()


 def download(url, md5sum, target_dir):
-    """
-    Download file from url to target_dir, and check md5sum.
+    """Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir): os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
-        os.system("wget -c " + url + " -P " + target_dir)
+        ret = os.system("wget -c " + url + " -P " + target_dir)
+        print(ret)
        print("\nMD5 Chesksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
@@ -77,8 +75,7 @@ def download(url, md5sum, target_dir):


 def unpack(filepath, target_dir):
-    """
-    Unpack the file to the target_dir.
+    """Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
@@ -87,8 +84,7 @@ def unpack(filepath, target_dir):


 def create_manifest(data_dir, manifest_path):
-    """
-    Create a manifest json file summarizing the data set, with each line
+    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
@@ -119,8 +115,7 @@ def create_manifest(data_dir, manifest_path):


 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """
-    Download, unpack and create summmary manifest file.
+    """Download, unpack and create summmary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
@@ -135,6 +130,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):


 def main():
+    args.target_dir = os.path.expanduser(args.target_dir)
+
    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
@@ -145,12 +142,12 @@ def main():
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
-    prepare_dataset(
-        url=URL_TRAIN_CLEAN_100,
-        md5sum=MD5_TRAIN_CLEAN_100,
-        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,

--- a/deep_speech_2/deploy/demo_server.py
+++ b/deep_speech_2/deploy/demo_server.py
@@ -11,7 +11,7 @@ import wave
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
 from utils.utility import add_arguments, print_arguments


--- a/deep_speech_2/examples/librispeech/prepare_data.sh
+++ b/deep_speech_2/examples/librispeech/prepare_data.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

 # download data, generate manifests
 python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
--target_dir='~/.cache/paddle/dataset/speech/Libri'
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
+--full_download='True'

 if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
 fi

-cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train


-# build vocabulary (can be skipped for English, as already provided)
-# python tools/build_vocab.py \
-# --count_threshold=0 \
-# --vocab_path='data/librispeech/eng_vocab.txt' \
-# --manifest_paths='data/librispeech/manifeset.train'
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/librispeech/vocab.txt' \
+--manifest_paths='data/librispeech/manifest.train'
+
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi


 # compute mean and stddev for normalizer
@@ -37,3 +42,4 @@ fi


 echo "LibriSpeech Data preparation done."
+exit 0
--- a/deep_speech_2/examples/librispeech/run_infer.sh
+++ b/deep_speech_2/examples/librispeech/run_infer.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# infer
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
 --num_samples=10 \
 --trainer_count=1 \
 --beam_size=500 \
--num_proc_bsearch=12 \
+--num_proc_bsearch=8 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -17,11 +27,19 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \
+--infer_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_infer_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_infer_golden.sh
+#! /usr/bin/bash
+
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_test.sh
+++ b/deep_speech_2/examples/librispeech/run_test.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python -u evaluate.py \
+python -u test.py \
 --batch_size=128 \
 --trainer_count=8 \
 --beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -20,9 +30,17 @@ python -u evaluate.py \
 --share_rnn_weights=True \
 --test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_test_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_test_golden.sh
+#! /usr/bin/bash
+
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_train.sh
+++ b/deep_speech_2/examples/librispeech/run_train.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# train model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u train.py \
--batch_size=256 \
+--batch_size=512 \
 --trainer_count=8 \
 --num_passes=50 \
 --num_proc_data=12 \
@@ -23,8 +24,16 @@ python -u train.py \
 --train_manifest='data/librispeech/manifest.train' \
 --dev_manifest='data/librispeech/manifest.dev' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--output_model_dir='./checkpoints' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir='./checkpoints/libri' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_tune.sh
+++ b/deep_speech_2/examples/librispeech/run_tune.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u tools/tune.py \
 --num_samples=100 \
@@ -23,8 +24,16 @@ python -u tools/tune.py \
 --share_rnn_weights=True \
 --tune_manifest='data/librispeech/manifest.dev-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/mandarin/run_demo_client.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_client.sh
+#! /usr/bin/bash
+
+# Launch the demo client against a demo server on localhost:8086.
+# Paths are relative to the deep_speech_2 root, hence the pushd.
+pushd ../.. > /dev/null
+
+# start demo client
+# NOTE: the last option must not end with a line continuation, otherwise
+# whatever follows it would be swallowed as an extra argument.
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_client.py \
+--host_ip='localhost' \
+--host_port=8086
+
+if [ $? -ne 0 ]; then
+    echo "Failed in starting demo client!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/mandarin/run_demo_server.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_server.sh
+#! /usr/bin/bash
+# TODO: replace the model with a mandarin model
+
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# start demo server
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_server.py \
+--host_ip='localhost' \
+--host_port=8086 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--speech_save_dir='demo_cache' \
+--warmup_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in starting demo server!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_data.sh
+++ b/deep_speech_2/examples/tiny/run_data.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

 # download data, generate manifests
-python data/tiny/tiny.py \
+python data/librispeech/librispeech.py \
 --manifest_prefix='data/tiny/manifest' \
--target_dir=$HOME'/.cache/paddle/dataset/speech/tiny'
+--target_dir='~/.cache/paddle/dataset/speech/libri' \
+--full_download='False'

 if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
 fi

-cat data/tiny/manifest.dev-clean | head -n 32 > data/tiny/manifest.train
-cat data/tiny/manifest.dev-clean | head -n 48 | tail -n 16 > data/tiny/manifest.dev
-cat data/tiny/manifest.dev-clean | head -n 64 | tail -n 16 > data/tiny/manifest.test
+head -n 64 data/tiny/manifest.dev-clean  > data/tiny/manifest.tiny


 # build vocabulary
 python tools/build_vocab.py \
 --count_threshold=0 \
 --vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.train'
+--manifest_paths='data/tiny/manifest.dev-clean'

 if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
@@ -31,8 +30,8 @@ fi

 # compute mean and stddev for normalizer
 python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.train' \
--num_samples=32 \
+--manifest_path='data/tiny/manifest.tiny' \
+--num_samples=64 \
 --specgram_type='linear' \
 --output_path='data/tiny/mean_std.npz'

@@ -43,3 +42,4 @@ fi


 echo "Tiny data preparation done."
+exit 0
--- a/deep_speech_2/examples/tiny/run_infer.sh
+++ b/deep_speech_2/examples/tiny/run_infer.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# infer
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
--num_samples=4 \
+--num_samples=10 \
 --trainer_count=1 \
 --beam_size=500 \
--num_proc_bsearch=12 \
+--num_proc_bsearch=8 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -17,11 +27,19 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.train' \
+--infer_manifest='data/tiny/manifest.tiny' \
 --mean_std_path='data/tiny/mean_std.npz' \
 --vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/params.pass-14.tar.gz' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
 --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_infer_golden.sh
+++ b/deep_speech_2/examples/tiny/run_infer_golden.sh
+#! /usr/bin/bash
+
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# infer
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_test.sh
+++ b/deep_speech_2/examples/tiny/run_test.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python -u evaluate.py \
--batch_size=128 \
+python -u test.py \
+--batch_size=16 \
 --trainer_count=8 \
 --beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -18,11 +28,19 @@ python -u evaluate.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--test_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_test_golden.sh
+++ b/deep_speech_2/examples/tiny/run_test_golden.sh
+#! /usr/bin/bash
+
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+
+# evaluate model
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_train.sh
+++ b/deep_speech_2/examples/tiny/run_train.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

-CUDA_VISIBLE_DEVICES=0,1 \
+# train model
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python -u train.py \
--batch_size=2 \
--trainer_count=1 \
--num_passes=10 \
+--batch_size=16 \
+--trainer_count=4 \
+--num_passes=20 \
 --num_proc_data=1 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
 --num_iter_print=100 \
--learning_rate=5e-5 \
+--learning_rate=1e-5 \
 --max_duration=27.0 \
 --min_duration=0.0 \
 --use_sortagrad=True \
@@ -20,11 +21,19 @@ python -u train.py \
 --use_gpu=True \
 --is_local=True \
 --share_rnn_weights=True \
--train_manifest='data/tiny/manifest.train' \
--dev_manifest='data/tiny/manifest.train' \
+--train_manifest='data/tiny/manifest.tiny' \
+--dev_manifest='data/tiny/manifest.tiny' \
 --mean_std_path='data/tiny/mean_std.npz' \
 --vocab_path='data/tiny/vocab.txt' \
--output_model_dir='./checkpoints' \
+--output_model_dir='./checkpoints/tiny' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped'
+
+# abort with a non-zero status when train.py fails
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_tune.sh
+++ b/deep_speech_2/examples/tiny/run_tune.sh
 #! /usr/bin/bash

-pushd ../..
+pushd ../.. > /dev/null

+# grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u tools/tune.py \
 --num_samples=100 \
@@ -21,10 +22,18 @@ python -u tools/tune.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--tune_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/models/librispeech/download_model.sh
+++ b/deep_speech_2/models/librispeech/download_model.sh
+#! /usr/bin/bash
+
+source ../../utils/utility.sh
+
+# TODO: add urls
+URL='to-be-added'
+MD5=5b4af224b26c1dc4dd972b7d32f2f52a
+TARGET=./librispeech_model.tar.gz
+
+
+echo "Download LibriSpeech model ..."
+download $URL $MD5 $TARGET
+if [ $? -ne 0 ]; then
+    echo "Fail to download LibriSpeech model!"
+    exit 1
+fi
+tar -zxvf $TARGET
+
+
+exit 0
--- a/deep_speech_2/models/lm/download_en.sh
+++ b/deep_speech_2/models/lm/download_en.sh
-echo "Downloading language model ..."
-
-mkdir data
-
-LM=common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-
-wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
-
-echo "Checking md5sum ..."
-md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
-
-if [ $MD5 != $md5_tmp ]; then
-    echo "Fail to download the language model!"
-    exit 1
-fi
--- a/deep_speech_2/models/lm/download_lm_en.sh
+++ b/deep_speech_2/models/lm/download_lm_en.sh
+#! /usr/bin/bash
+
+source ../../utils/utility.sh
+
+URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
+MD5="099a601759d467cd0a8523ff939819c5"
+TARGET=./common_crawl_00.prune01111.trie.klm
+
+
+echo "Download language model ..."
+download $URL $MD5 $TARGET
+if [ $? -ne 0 ]; then
+    echo "Fail to download the language model!"
+    exit 1
+fi
+
+
+exit 0
--- a/deep_speech_2/utils/utility.sh
+++ b/deep_speech_2/utils/utility.sh
+# Download a file with wget unless a copy with the expected checksum exists.
+#
+# Usage: download URL MD5 TARGET
+#   URL     address to fetch
+#   MD5     expected md5sum of the downloaded file
+#   TARGET  path where the file is expected after the download
+#
+# Returns 0 when TARGET exists and matches MD5, 1 otherwise.
+# NOTE: wget stores the file under the URL's basename inside TARGET's
+# directory, so TARGET is assumed to end with that same basename.
+download() {
+    URL=$1
+    MD5=$2
+    TARGET=$3
+
+    # Skip the download when a verified copy is already present.
+    if [ -e "$TARGET" ]; then
+        md5_result=$(md5sum "$TARGET" | awk '{print $1}')
+        if [ "$MD5" = "$md5_result" ]; then
+            echo "$TARGET already exists, download skipped."
+            return 0
+        fi
+    fi
+
+    wget -c "$URL" -P "$(dirname "$TARGET")"
+
+    # Fail when the checksum does NOT match; this also covers a failed wget,
+    # since a missing file yields an empty md5_result.
+    md5_result=$(md5sum "$TARGET" | awk '{print $1}')
+    if [ "$MD5" != "$md5_result" ]; then
+        echo "MD5 checksum mismatch for $TARGET, download failed!"
+        return 1
+    fi
+
+    return 0
+}