adapt to the new structure

c4bc8228 · Yibing Liu · 41e9e59d · 848bb8ab · c4bc8228 · 41e9e59d
63 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,7 @@ addons:
      - python-pip
      - python2.7-dev
 before_install:
-  -  pip install -U virtualenv pre-commit pip
+  -  sudo pip install -U virtualenv pre-commit pip
  -  docker pull paddlepaddle/paddle:latest
 script:
  -  .travis/precommit.sh

--- a/deep_speech_2/.gitignore
+++ b/deep_speech_2/.gitignore
-manifest*
-mean_std.npz
-thirdparty/
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
--- a/deep_speech_2/data/librispeech/eng_vocab.txt
+++ b/deep_speech_2/data/librispeech/eng_vocab.txt
-'
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
--- a/deep_speech_2/data/librispeech/librispeech.py
+++ b/deep_speech_2/data/librispeech/librispeech.py
@@ -19,8 +19,6 @@ import json
 import codecs
 from paddle.v2.dataset.common import md5file
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
 URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
-    default=DATA_HOME + "/Libri",
+    default='~/.cache/paddle/dataset/speech/libri',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@@ -60,8 +58,7 @@ args = parser.parse_args()
 def download(url, md5sum, target_dir):
-    """
+    """Download file from url to target_dir, and check md5sum.
-    Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir): os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
@@ -77,8 +74,7 @@ def download(url, md5sum, target_dir):
 def unpack(filepath, target_dir):
-    """
+    """Unpack the file to the target_dir.
-    Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
@@ -87,8 +83,7 @@ def unpack(filepath, target_dir):
 def create_manifest(data_dir, manifest_path):
-    """
+    """Create a manifest json file summarizing the data set, with each line
-    Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
@@ -119,8 +114,7 @@ def create_manifest(data_dir, manifest_path):
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """
+    """Download, unpack and create summmary manifest file.
-    Download, unpack and create summmary manifest file.
    """
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
@@ -135,6 +129,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
+    args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
@@ -145,12 +141,12 @@ def main():
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
-    prepare_dataset(
-        url=URL_TRAIN_CLEAN_100,
-        md5sum=MD5_TRAIN_CLEAN_100,
-        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,

--- a/deep_speech_2/lm/__init__.py
+++ b/deep_speech_2/lm/__init__.py
--- a/deep_speech_2/models/swig_decoders/_init_paths.py
+++ b/deep_speech_2/models/swig_decoders/_init_paths.py
--- a/deep_speech_2/models/swig_decoders/ctc_decoders.cpp
+++ b/deep_speech_2/models/swig_decoders/ctc_decoders.cpp
@@ -7,8 +7,8 @@
 #include <map>
 #include <utility>
-#include "fst/fstlib.h"
 #include "ThreadPool.h"
+#include "fst/fstlib.h"
 #include "decoder_utils.h"
 #include "path_trie.h"

--- a/deep_speech_2/models/swig_decoders/ctc_decoders.h
+++ b/deep_speech_2/models/swig_decoders/ctc_decoders.h
--- a/deep_speech_2/models/swig_decoders/decoder_utils.cpp
+++ b/deep_speech_2/models/swig_decoders/decoder_utils.cpp
--- a/deep_speech_2/models/swig_decoders/decoder_utils.h
+++ b/deep_speech_2/models/swig_decoders/decoder_utils.h
--- a/deep_speech_2/models/swig_decoders/decoders.i
+++ b/deep_speech_2/models/swig_decoders/decoders.i
--- a/deep_speech_2/models/swig_decoders/path_trie.cpp
+++ b/deep_speech_2/models/swig_decoders/path_trie.cpp
--- a/deep_speech_2/models/swig_decoders/path_trie.h
+++ b/deep_speech_2/models/swig_decoders/path_trie.h
 #ifndef PATH_TRIE_H
 #define PATH_TRIE_H
 #pragma once
+#include <fst/fstlib.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
 #include <utility>
 #include <vector>
-#include <fst/fstlib.h>
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;

--- a/deep_speech_2/models/swig_decoders/scorer.cpp
+++ b/deep_speech_2/models/swig_decoders/scorer.cpp
--- a/deep_speech_2/models/swig_decoders/scorer.h
+++ b/deep_speech_2/models/swig_decoders/scorer.h
--- a/deep_speech_2/models/swig_decoders/setup.py
+++ b/deep_speech_2/models/swig_decoders/setup.py
--- a/deep_speech_2/models/swig_decoders/setup.sh
+++ b/deep_speech_2/models/swig_decoders/setup.sh
--- a/deep_speech_2/models/swig_decoders_wrapper.py
+++ b/deep_speech_2/models/swig_decoders_wrapper.py
--- a/deep_speech_2/deploy/demo_server.py
+++ b/deep_speech_2/deploy/demo_server.py
@@ -11,7 +11,7 @@ import wave
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
 from utils.utility import add_arguments, print_arguments
@@ -46,7 +46,7 @@ add_arg('vocab_path',       str,
        'data/librispeech/eng_vocab.txt',
        "Filepath of vocabulary.")
 add_arg('model_path',       str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path',  str,

--- a/deep_speech_2/examples/librispeech/prepare_data.sh
+++ b/deep_speech_2/examples/librispeech/prepare_data.sh
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
 # download data, generate manifests
 python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
--target_dir=$HOME'/.cache/paddle/dataset/speech/Libri'
+--full_download='True'
 if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
 fi
-#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/librispeech/vocab.txt' \
+--manifest_paths='data/librispeech/manifest.train'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
 # compute mean and stddev for normalizer
@@ -30,3 +42,4 @@ fi
 echo "LibriSpeech Data preparation done."
+exit 0
--- a/deep_speech_2/examples/librispeech/generate.sh
+++ b/deep_speech_2/examples/librispeech/generate.sh
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
 --num_samples=10 \
 --trainer_count=1 \
 --beam_size=500 \
--num_proc_bsearch=12 \
+--num_proc_bsearch=8 \
--num_proc_data=12 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -18,11 +27,19 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \
+--infer_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
--- a/deep_speech_2/examples/librispeech/run_infer_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_infer_golden.sh
+#! /usr/bin/bash
+
+# Infer a few LibriSpeech test-clean samples with the downloaded
+# ("golden") model.  All relative paths below assume the script is
+# launched from its own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# Fetch the language model; give up if the download script fails.
+pushd models/lm > /dev/null
+sh download_lm_en.sh || exit 1
+popd > /dev/null
+
+# Fetch the well-trained model; give up if the download script fails.
+pushd models/librispeech > /dev/null
+sh download_model.sh || exit 1
+popd > /dev/null
+
+# Run inference; report and exit non-zero when infer.py fails.
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=2.15 \
+--beta=0.35 \
+--cutoff_prob=1.0 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear' || {
+    echo "Failed in inference!"
+    exit 1
+}
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_test.sh
+++ b/deep_speech_2/examples/librispeech/run_test.sh
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u test.py \
 --batch_size=128 \
 --trainer_count=8 \
 --beam_size=500 \
--num_proc_bsearch=12 \
+--num_proc_bsearch=8 \
--num_proc_data=12 \
+--num_proc_data=4 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -20,9 +30,17 @@ python -u test.py \
 --share_rnn_weights=True \
 --test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
--- a/deep_speech_2/examples/librispeech/run_test_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_test_golden.sh
+#! /usr/bin/bash
+
+# Evaluate the downloaded ("golden") LibriSpeech model on the LibriSpeech
+# test-clean manifest.  All relative paths below assume the script is
+# launched from its own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+# download well-trained model
+pushd models/librispeech > /dev/null
+sh download_model.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+# evaluate model
+# The test manifest is the one generated by the LibriSpeech data preparation
+# (data/librispeech/...), matching the other scripts of this example, rather
+# than the manifest of the separate "tiny" example.
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/examples/librispeech/run_train.sh
+++ b/deep_speech_2/examples/librispeech/run_train.sh
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an existing model, uncomment --init_model_path
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u train.py \
--batch_size=256 \
+--batch_size=512 \
 --trainer_count=8 \
--num_passes=200 \
+--num_passes=50 \
 --num_proc_data=12 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
@@ -23,8 +25,16 @@ python -u train.py \
 --train_manifest='data/librispeech/manifest.train' \
 --dev_manifest='data/librispeech/manifest.dev' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
--output_model_dir='./checkpoints' \
+--output_model_dir='./checkpoints/libri' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
--- a/deep_speech_2/examples/librispeech/run_tune.sh
+++ b/deep_speech_2/examples/librispeech/run_tune.sh
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u tools/tune.py \
 --num_samples=100 \
@@ -23,8 +24,16 @@ python -u tools/tune.py \
 --share_rnn_weights=True \
 --tune_manifest='data/librispeech/manifest.dev-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
+--vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+exit 0
--- a/deep_speech_2/examples/mandarin/run_demo_client.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_client.sh
+#! /usr/bin/bash
+
+# Launch deploy/demo_client.py.  The paths assume the script is started from
+# its own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# start demo client
+# host_ip/host_port are the same values run_demo_server.sh passes to the
+# demo server.  The last option must NOT end with a backslash: a trailing
+# line continuation would splice the following `if` line into the python
+# command line.
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_client.py \
+--host_ip='localhost' \
+--host_port=8086
+if [ $? -ne 0 ]; then
+    echo "Failed in starting demo client!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/examples/mandarin/run_demo_server.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_server.sh
+#! /usr/bin/bash
+# TODO: replace the model with a mandarin model
+
+# Start the demo server with the downloaded LibriSpeech model.  All relative
+# paths below assume the script is launched from its own directory, two
+# levels below the project root.
+pushd ../.. > /dev/null
+
+# Fetch the language model; give up if the download script fails.
+pushd models/lm > /dev/null
+sh download_lm_en.sh || exit 1
+popd > /dev/null
+
+# Fetch the well-trained model; give up if the download script fails.
+pushd models/librispeech > /dev/null
+sh download_model.sh || exit 1
+popd > /dev/null
+
+# Launch the server; report and exit non-zero when demo_server.py fails.
+CUDA_VISIBLE_DEVICES=0 \
+python -u deploy/demo_server.py \
+--host_ip='localhost' \
+--host_port=8086 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--speech_save_dir='demo_cache' \
+--warmup_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--specgram_type='linear' || {
+    echo "Failed in starting demo server!"
+    exit 1
+}
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_data.sh
+++ b/deep_speech_2/examples/tiny/run_data.sh
+#! /usr/bin/bash
+
+# Prepare the "tiny" example data set: download LibriSpeech (without the
+# full download), keep the first 64 dev-clean utterances as the tiny
+# manifest, then build the vocabulary and the feature normalizer statistics.
+# The paths assume the script is started from its own directory, two levels
+# below the project root.
+pushd ../.. > /dev/null
+
+# prepare folder
+if [ ! -e data/tiny ]; then
+    mkdir data/tiny
+fi
+
+# download data, generate manifests
+# (librispeech.py expands the leading `~` of --target_dir itself)
+python data/librispeech/librispeech.py \
+--manifest_prefix='data/tiny/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/libri' \
+--full_download='False'
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+
+head -n 64 data/tiny/manifest.dev-clean  > data/tiny/manifest.tiny
+
+# build vocabulary
+# librispeech.py writes `<manifest_prefix>.dev-clean`; there is no plain
+# `manifest.dev` file, so the vocabulary is built from the dev-clean manifest.
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/tiny/vocab.txt' \
+--manifest_paths='data/tiny/manifest.dev-clean'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/tiny/manifest.tiny' \
+--num_samples=64 \
+--specgram_type='linear' \
+--output_path='data/tiny/mean_std.npz'
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+echo "Tiny data preparation done."
+exit 0
--- a/deep_speech_2/examples/tiny/run_infer.sh
+++ b/deep_speech_2/examples/tiny/run_infer.sh
+#! /usr/bin/bash
+
+# Infer a few samples of the tiny manifest with the model trained by the
+# tiny example.  All relative paths below assume the script is launched from
+# its own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# Fetch the language model; give up if the download script fails.
+pushd models/lm > /dev/null
+sh download_lm_en.sh || exit 1
+popd > /dev/null
+
+# Run inference; report and exit non-zero when infer.py fails.
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear' || {
+    echo "Failed in inference!"
+    exit 1
+}
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_infer_golden.sh
+++ b/deep_speech_2/examples/tiny/run_infer_golden.sh
+#! /usr/bin/bash
+
+# Infer a few samples of the tiny example's test-clean manifest with the
+# downloaded ("golden") model.  All relative paths below assume the script
+# is launched from its own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# Fetch the language model; give up if the download script fails.
+pushd models/lm > /dev/null
+sh download_lm_en.sh || exit 1
+popd > /dev/null
+
+# Fetch the well-trained model; give up if the download script fails.
+pushd models/librispeech > /dev/null
+sh download_model.sh || exit 1
+popd > /dev/null
+
+# Run inference; report and exit non-zero when infer.py fails.
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear' || {
+    echo "Failed in inference!"
+    exit 1
+}
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_test.sh
+++ b/deep_speech_2/examples/tiny/run_test.sh
+#! /usr/bin/bash
+
+# Evaluate the model trained by the tiny example on the tiny manifest.
+# The paths assume the script is started from its own directory, two levels
+# below the project root.
+pushd ../.. > /dev/null
+
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+
+# evaluate model
+# --model_path points into checkpoints/tiny, the --output_model_dir used by
+# the tiny run_train.sh (and the directory the tiny run_infer.sh reads from).
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=16 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_test_golden.sh
+++ b/deep_speech_2/examples/tiny/run_test_golden.sh
+#! /usr/bin/bash
+
+# Evaluate the downloaded ("golden") model on the tiny example's test-clean
+# manifest.  All relative paths below assume the script is launched from its
+# own directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# Fetch the language model; give up if the download script fails.
+pushd models/lm > /dev/null
+sh download_lm_en.sh || exit 1
+popd > /dev/null
+
+# Fetch the well-trained model; give up if the download script fails.
+pushd models/librispeech > /dev/null
+sh download_model.sh || exit 1
+popd > /dev/null
+
+# Run the evaluation; report and exit non-zero when test.py fails.
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u test.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/tiny/manifest.test-clean' \
+--mean_std_path='models/librispeech/mean_std.npz' \
+--vocab_path='models/librispeech/vocab.txt' \
+--model_path='models/librispeech/params.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear' || {
+    echo "Failed in evaluation!"
+    exit 1
+}
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_train.sh
+++ b/deep_speech_2/examples/tiny/run_train.sh
+#! /usr/bin/bash
+
+# Train a model on the tiny manifest and save checkpoints under
+# ./checkpoints/tiny.  The paths assume the script is started from its own
+# directory, two levels below the project root.
+pushd ../.. > /dev/null
+
+# train model
+# to resume from an existing model, add --init_model_path=<path> to the
+# command below
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python -u train.py \
+--batch_size=16 \
+--trainer_count=4 \
+--num_passes=20 \
+--num_proc_data=1 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=1e-5 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=True \
+--share_rnn_weights=True \
+--train_manifest='data/tiny/manifest.tiny' \
+--dev_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--output_model_dir='./checkpoints/tiny' \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    # This script trains; the message must not claim an inference failure.
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/examples/tiny/run_tune.sh
+++ b/deep_speech_2/examples/tiny/run_tune.sh
+#! /usr/bin/bash
+
+# Tune the language-model weights (alpha/beta grid) on the tiny manifest.
+# The paths assume the script is started from its own directory, two levels
+# below the project root.
+pushd ../.. > /dev/null
+
+# grid-search for hyper-parameters in language model
+# --model_path points into checkpoints/tiny, the --output_model_dir used by
+# the tiny run_train.sh.
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u tools/tune.py \
+--num_samples=100 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_alphas=14 \
+--num_betas=20 \
+--alpha_from=0.1 \
+--alpha_to=0.36 \
+--beta_from=0.05 \
+--beta_to=1.0 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--tune_manifest='data/tiny/manifest.tiny' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/tiny/params.pass-9.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--error_rate_type='wer' \
+--specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -35,13 +35,13 @@ add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
 add_arg('vocab_path',       str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
 add_arg('lang_model_path',  str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
 add_arg('model_path',       str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
 add_arg('decoding_method',  str,

--- a/deep_speech_2/lm/run.sh
+++ b/deep_speech_2/lm/run.sh
-echo "Downloading language model ..."
-mkdir data
-LM=common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
-echo "Checking md5sum ..."
-md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
-if [ $MD5 != $md5_tmp ]; then
-    echo "Fail to download the language model!"
-    exit 1
-fi
--- a/deep_speech_2/models/__init__.py
+++ b/deep_speech_2/models/__init__.py
--- a/deep_speech_2/models/decoder.py
+++ b/deep_speech_2/models/decoder.py
@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq,
                prob = prob * ext_scoring_func(result)
            log_prob = log(prob)
            beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
    ## output top beam_size decoding results
    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)

--- a/deep_speech_2/lm/lm_scorer.py
+++ b/deep_speech_2/lm/lm_scorer.py
--- a/deep_speech_2/models/model.py
+++ b/deep_speech_2/models/model.py
@@ -8,10 +8,10 @@ import os
 import time
 import gzip
 import paddle.v2 as paddle
-from models.swig_decoders_wrapper import Scorer
+from decoders.swig_wrapper import Scorer
-from models.swig_decoders_wrapper import ctc_greedy_decoder
+from decoders.swig_wrapper import ctc_greedy_decoder
-from models.swig_decoders_wrapper import ctc_beam_search_decoder_batch
+from decoders.swig_wrapper import ctc_beam_search_decoder_batch
-from models.network import deep_speech_v2_network
+from model_utils.network import deep_speech_v2_network
 class DeepSpeech2Model(object):

--- a/deep_speech_2/models/network.py
+++ b/deep_speech_2/models/network.py
--- a/deep_speech_2/models/tests/test_decoders.py
+++ b/deep_speech_2/models/tests/test_decoders.py
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-from models import decoder
+from model_utils import decoder
 class TestDecoders(unittest.TestCase):

--- a/deep_speech_2/models/librispeech/download_model.sh
+++ b/deep_speech_2/models/librispeech/download_model.sh
+#! /usr/bin/bash
+
+# Download and unpack the pre-trained LibriSpeech model into the current
+# directory.  Must be run from this script's own directory, because the
+# helper library is located through a relative path.
+
+# `.` is used instead of `source` so the script also works when it is run
+# through a POSIX `sh` (e.g. dash), which is how the example scripts call it.
+. ../../utils/utility.sh
+
+# TODO: add urls
+URL='to-be-added'
+MD5=5b4af224b26c1dc4dd972b7d32f2f52a
+TARGET=./librispeech_model.tar.gz
+
+echo "Download LibriSpeech model ..."
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download LibriSpeech model!"
+    exit 1
+fi
+
+# Unpack the archive and fail loudly if extraction does not succeed, so that
+# callers never continue with a partially extracted model.
+tar -zxvf "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to unpack LibriSpeech model!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/models/lm/download_lm_en.sh
+++ b/deep_speech_2/models/lm/download_lm_en.sh
+#! /usr/bin/bash
+
+# Download the English language model into the current directory.  Must be
+# run from this script's own directory, because the helper library is
+# located through a relative path.
+
+# `.` is used instead of `source` so the script also works when it is run
+# through a POSIX `sh` (e.g. dash), which is how the example scripts call it.
+. ../../utils/utility.sh
+
+URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
+MD5="099a601759d467cd0a8523ff939819c5"
+TARGET=./common_crawl_00.prune01111.trie.klm
+
+echo "Download language model ..."
+download "$URL" "$MD5" "$TARGET"
+if [ $? -ne 0 ]; then
+    echo "Fail to download the language model!"
+    exit 1
+fi
+
+exit 0
--- a/deep_speech_2/models/swig_decoders/__init__.py
+++ b/deep_speech_2/models/swig_decoders/__init__.py
--- a/deep_speech_2/test.py
+++ b/deep_speech_2/test.py
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -36,14 +36,14 @@ add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
 add_arg('vocab_path',       str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
 add_arg('model_path',       str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path',  str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
 add_arg('decoding_method',  str,
        'ctc_beam_search',

--- a/deep_speech_2/tools/build_vocab.py
+++ b/deep_speech_2/tools/build_vocab.py
@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('count_threshold',  int,    0,  "Truncation threshold for char counts.")
 add_arg('vocab_path',       str,
-        'datasets/vocab/zh_vocab.txt',
+        'data/librispeech/vocab.txt',
        "Filepath to write the vocabulary.")
 add_arg('manifest_paths',   str,
        None,
@@ -34,7 +34,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
-    manifest_jsons = utils.read_manifest(manifest_path)
+    manifest_jsons = read_manifest(manifest_path)
    for line_json in manifest_jsons:
        for char in line_json['text']:
            counter.update(char)

--- a/deep_speech_2/tools/compute_mean_std.py
+++ b/deep_speech_2/tools/compute_mean_std.py
@@ -20,10 +20,10 @@ add_arg('specgram_type',    str,
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
 add_arg('manifest_path',    str,
-        'datasets/manifest.train',
+        'data/librispeech/manifest.train',
        "Filepath of manifest to compute normalizer's mean and stddev.")
 add_arg('output_path',    str,
-        'mean_std.npz',
+        'data/librispeech/mean_std.npz',
        "Filepath of write mean and stddev to (.npz).")
 # yapf: disable
 args = parser.parse_args()

--- a/deep_speech_2/tools/tune.py
+++ b/deep_speech_2/tools/tune.py
@@ -9,7 +9,7 @@ import functools
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer
 from utils.utility import add_arguments, print_arguments
@@ -41,13 +41,13 @@ add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
 add_arg('vocab_path',       str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
 add_arg('lang_model_path',  str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
 add_arg('model_path',       str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
 add_arg('error_rate_type',  str,

--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import argparse
 import functools
 import paddle.v2 as paddle
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.data import DataGenerator
 from utils.utility import add_arguments, print_arguments
@@ -41,14 +41,14 @@ add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
 add_arg('vocab_path',       str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
 add_arg('init_model_path',  str,
        None,
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
 add_arg('output_model_dir', str,
-        "./checkpoints",
+        "./checkpoints/libri",
        "Directory for saving checkpoints.")
 add_arg('augment_conf_path',str,
        'conf/augmentation.config',

--- a/deep_speech_2/utils/utility.sh
+++ b/deep_speech_2/utils/utility.sh
+download() {
+    # Download URL into the directory of TARGET unless TARGET already exists
+    # with the expected MD5 checksum.
+    #   $1 - URL to fetch
+    #   $2 - expected MD5 checksum (hex string)
+    #   $3 - path at which the downloaded file is expected
+    # Returns 0 on success or when the download is skipped, 1 when the
+    # checksum of the file at TARGET does not match after downloading.
+    URL=$1
+    MD5=$2
+    TARGET=$3
+    if [ -e "$TARGET" ]; then
+        md5_result=`md5sum "$TARGET" | awk -F[' '] '{print $1}'`
+        if [ "$MD5" = "$md5_result" ]; then
+            echo "$TARGET already exists, download skipped."
+            return 0
+        fi
+    fi
+    wget -c "$URL" -P "$(dirname "$TARGET")"
+    md5_result=`md5sum "$TARGET" | awk -F[' '] '{print $1}'`
+    # Checksums are hex strings, so compare them as strings: the integer
+    # operator -ne fails with "integer expression expected" on such operands
+    # and the mismatch branch would never be taken.
+    if [ "$MD5" != "$md5_result" ]; then
+        echo "Fail to download the language model!"
+        return 1
+    fi
+}
--- a/mt_with_external_memory/README.md
+++ b/mt_with_external_memory/README.md
--- a/mt_with_external_memory/data_utils.py
+++ b/mt_with_external_memory/data_utils.py
+"""
+    Contains data utilities.
+"""
+def reader_append_wrapper(reader, append_tuple):
+    """Wrap a data reader so every instance is extended with extra fields.
+
+    :param reader: Callable returning an iterable of instances; each instance
+                   must support ``+`` with ``append_tuple``.
+    :param append_tuple: Tuple concatenated to the end of every instance.
+    :return: A reader (callable) yielding ``instance + append_tuple``.
+    """
+    def extended_reader():
+        for instance in reader():
+            yield instance + append_tuple
+    return extended_reader
--- a/mt_with_external_memory/external_memory.py
+++ b/mt_with_external_memory/external_memory.py
+"""
+    External neural memory class.
+"""
+import paddle.v2 as paddle
+class ExternalMemory(object):
+    """External neural memory class.
+
+    A simplified Neural Turing Machine (NTM) with only content-based
+    addressing (including content addressing and interpolation, but excluding
+    convolutional shift and sharpening). It serves as an external
+    differentiable memory bank, with differentiable write/read head
+    controllers to store and read information dynamically. Simple feedforward
+    networks are used as the write/read head controllers.
+
+    The ExternalMemory class could be utilized by many neural network
+    structures to easily expand their memory bandwidth and accomplish a
+    long-term memory handling. Besides, some existing mechanism can be realized
+    directly with the ExternalMemory class, e.g. the attention mechanism in
+    Seq2Seq (i.e. an unbounded external memory).
+
+    The ExternalMemory class must be used together with
+    paddle.layer.recurrent_group (within its step function). It can never be
+    used in a standalone manner.
+
+    When ``readonly`` is False, ``write`` must be called before ``read`` in the
+    step function: ``read`` operates on the memory produced by ``write``.
+
+    For more details, please refer to
+    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
+
+    :param name: Memory name.
+    :type name: basestring
+    :param mem_slot_size: Size of memory slot/vector.
+    :type mem_slot_size: int
+    :param boot_layer: Boot layer for initializing the external memory. The
+                       sequence layer has sequence length indicating the number
+                       of memory slots, and size as memory slot size.
+    :type boot_layer: LayerOutput
+    :param readonly: If true, the memory is read-only, and write function cannot
+                     be called. Default is false.
+    :type readonly: bool
+    :param enable_interpolation: If set true, the read/write addressing weights
+                                 will be interpolated with the weights in the
+                                 last step, with the affine coefficients being
+                                 a learnable gate function.
+    :type enable_interpolation: bool
+    """
+    def __init__(self,
+                 name,
+                 mem_slot_size,
+                 boot_layer,
+                 readonly=False,
+                 enable_interpolation=True):
+        self.name = name
+        self.mem_slot_size = mem_slot_size
+        self.readonly = readonly
+        self.enable_interpolation = enable_interpolation
+        # memory of the previous step; linked by name to the layer called
+        # `self.name` that is created in this step (see below and write())
+        self.external_memory = paddle.layer.memory(
+            name=self.name, size=self.mem_slot_size, boot_layer=boot_layer)
+        # prepare a constant (zero) initializer for addressing weights
+        self.zero_addressing_init = paddle.layer.slope_intercept(
+            input=paddle.layer.fc(input=boot_layer, size=1),
+            slope=0.0,
+            intercept=0.0)
+        # set memory to constant when readonly=True: the layer named
+        # `self.name` is an identity copy of the previous memory
+        if self.readonly:
+            self.updated_external_memory = paddle.layer.mixed(
+                name=self.name,
+                input=[
+                    paddle.layer.identity_projection(input=self.external_memory)
+                ],
+                size=self.mem_slot_size)
+    def _content_addressing(self, key_vector):
+        """Get write/read head's addressing weights via content-based addressing.
+
+        :param key_vector: Key vector compared against every memory slot.
+        :type key_vector: LayerOutput
+        :return: Addressing weights over the memory slots (sequence softmax).
+        :rtype: LayerOutput
+        """
+        # content-based addressing: a=tanh(W*M + U*key)
+        key_projection = paddle.layer.fc(
+            input=key_vector,
+            size=self.mem_slot_size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        key_proj_expanded = paddle.layer.expand(
+            input=key_projection, expand_as=self.external_memory)
+        memory_projection = paddle.layer.fc(
+            input=self.external_memory,
+            size=self.mem_slot_size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        merged_projection = paddle.layer.addto(
+            input=[key_proj_expanded, memory_projection],
+            act=paddle.activation.Tanh())
+        # softmax addressing weight: w=softmax(v^T a)
+        addressing_weight = paddle.layer.fc(
+            input=merged_projection,
+            size=1,
+            act=paddle.activation.SequenceSoftmax(),
+            bias_attr=False)
+        return addressing_weight
+    def _interpolation(self, head_name, key_vector, addressing_weight):
+        """Interpolate between previous and current addressing weights.
+
+        :param head_name: Head identifier, used to name the per-head
+                          addressing weight memory.
+        :type head_name: basestring
+        :param key_vector: Key vector from which the scalar gate is computed.
+        :type key_vector: LayerOutput
+        :param addressing_weight: Addressing weights of the current step.
+        :type addressing_weight: LayerOutput
+        :return: Interpolated addressing weights.
+        :rtype: LayerOutput
+        """
+        # prepare interpolation scalar gate: g=sigmoid(W*key)
+        gate = paddle.layer.fc(
+            input=key_vector,
+            size=1,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=False)
+        # interpolation: w_t = g*w_t+(1-g)*w_{t-1}
+        last_addressing_weight = paddle.layer.memory(
+            name=self.name + "_addressing_weight_" + head_name,
+            size=1,
+            boot_layer=self.zero_addressing_init)
+        # The second input must be the previous step's weights; passing the
+        # current weights twice would turn the interpolation into a no-op.
+        interpolated_weight = paddle.layer.interpolation(
+            name=self.name + "_addressing_weight_" + head_name,
+            input=[addressing_weight, last_addressing_weight],
+            weight=paddle.layer.expand(input=gate, expand_as=addressing_weight))
+        return interpolated_weight
+    def _get_addressing_weight(self, head_name, key_vector):
+        """Get final addressing weights for read/write heads, including content
+        addressing and interpolation.
+
+        :param head_name: Head identifier ("write_head" or "read_head").
+        :type head_name: basestring
+        :param key_vector: Key vector of the head.
+        :type key_vector: LayerOutput
+        :return: Addressing weights, interpolated with the previous step's
+                 weights when enable_interpolation is set.
+        :rtype: LayerOutput
+        """
+        # current content-based addressing
+        addressing_weight = self._content_addressing(key_vector)
+        # interpolation with previous addressing weight
+        if self.enable_interpolation:
+            return self._interpolation(head_name, key_vector, addressing_weight)
+        else:
+            return addressing_weight
+    def write(self, write_key):
+        """Write onto the external memory.
+
+        It cannot be called if "readonly" set True.
+
+        :param write_key: Key vector for write heads to generate writing
+                          content and addressing signals.
+        :type write_key: LayerOutput
+        :raises ValueError: If the memory was created with readonly=True.
+        """
+        # check readonly
+        if self.readonly:
+            raise ValueError("ExternalMemory with readonly=True cannot write.")
+        # get addressing weight for write head
+        write_weight = self._get_addressing_weight("write_head", write_key)
+        # prepare add_vector and erase_vector
+        erase_vector = paddle.layer.fc(
+            input=write_key,
+            size=self.mem_slot_size,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=False)
+        add_vector = paddle.layer.fc(
+            input=write_key,
+            size=self.mem_slot_size,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=False)
+        erase_vector_expand = paddle.layer.expand(
+            input=erase_vector, expand_as=self.external_memory)
+        add_vector_expand = paddle.layer.expand(
+            input=add_vector, expand_as=self.external_memory)
+        # prepare scaled add part and erase part
+        scaled_erase_vector_expand = paddle.layer.scaling(
+            weight=write_weight, input=erase_vector_expand)
+        erase_memory_part = paddle.layer.mixed(
+            input=paddle.layer.dotmul_operator(
+                a=self.external_memory,
+                b=scaled_erase_vector_expand,
+                scale=-1.0))
+        add_memory_part = paddle.layer.scaling(
+            weight=write_weight, input=add_vector_expand)
+        # update external memory: M_t = M_{t-1} + add_part - erase_part
+        self.updated_external_memory = paddle.layer.addto(
+            input=[self.external_memory, add_memory_part, erase_memory_part],
+            name=self.name)
+    def read(self, read_key):
+        """Read from the external memory.
+
+        For a writable memory (readonly=False), write() must have been called
+        first in the same step, since the updated memory is what gets read.
+
+        :param read_key: Key vector for read head to generate addressing
+                         signals.
+        :type read_key: LayerOutput
+        :return: Content (vector) read from external memory.
+        :rtype: LayerOutput
+        """
+        # get addressing weight for read head
+        read_weight = self._get_addressing_weight("read_head", read_key)
+        # read content from external memory: weighted sum over memory slots
+        scaled = paddle.layer.scaling(
+            weight=read_weight, input=self.updated_external_memory)
+        return paddle.layer.pooling(
+            input=scaled, pooling_type=paddle.pooling.Sum())
--- a/mt_with_external_memory/image/lstm_c_state.png
+++ b/mt_with_external_memory/image/lstm_c_state.png
--- a/mt_with_external_memory/image/memory_enhanced_decoder.png
+++ b/mt_with_external_memory/image/memory_enhanced_decoder.png
--- a/mt_with_external_memory/image/neural_turing_machine_arch.png
+++ b/mt_with_external_memory/image/neural_turing_machine_arch.png
--- a/mt_with_external_memory/image/turing_machine_cartoon.gif
+++ b/mt_with_external_memory/image/turing_machine_cartoon.gif
--- a/mt_with_external_memory/infer.py
+++ b/mt_with_external_memory/infer.py
+"""
+    Contains infering script for machine translation with external memory.
+"""
+import distutils.util
+import argparse
+import gzip
+import paddle.v2 as paddle
+from external_memory import ExternalMemory
+from model import memory_enhanced_seq2seq
+from data_utils import reader_append_wrapper
+parser = argparse.ArgumentParser(description=__doc__)
+# Command line options as (flag, default, type, help), in registration order.
+_ARG_SPECS = [
+    ("--dict_size", 30000, int, "Vocabulary size. (default: %(default)s)"),
+    ("--word_vec_dim", 512, int, "Word embedding size. (default: %(default)s)"),
+    ("--hidden_size", 1024, int,
+     "Hidden cell number in RNN. (default: %(default)s)"),
+    ("--memory_slot_num", 8, int,
+     "External memory slot number. (default: %(default)s)"),
+    ("--beam_size", 3, int, "Beam search width. (default: %(default)s)"),
+    ("--use_gpu", False, distutils.util.strtobool,
+     "Use gpu or not. (default: %(default)s)"),
+    ("--trainer_count", 1, int, "Trainer number. (default: %(default)s)"),
+    ("--batch_size", 5, int, "Batch size. (default: %(default)s)"),
+    ("--infer_data_num", 3, int,
+     "Instance num to infer. (default: %(default)s)"),
+    ("--model_filepath", "checkpoints/params.latest.tar.gz", str,
+     "Model filepath. (default: %(default)s)"),
+    ("--memory_perturb_stddev", 0.1, float,
+     "Memory perturb stddev for memory initialization."
+     "(default: %(default)s)"),
+]
+for _flag, _default, _type, _help in _ARG_SPECS:
+    parser.add_argument(_flag, default=_default, type=_type, help=_help)
+args = parser.parse_args()
+def parse_beam_search_result(beam_result, dictionary):
+    """Convert raw beam search output into probabilities and sentences.
+
+    :param beam_result: Pair whose first item holds the candidate
+                        probabilities per instance, and whose second item is
+                        a flat sequence of word ids in which -1 terminates
+                        each candidate.
+    :param dictionary: Mapping from word id to word string.
+    :return: Tuple of (beam_probs, beam_sentences); beam_sentences groups the
+             decoded candidates into chunks of ``len(beam_probs[0])``.
+    """
+    beam_probs, word_ids = beam_result[0], beam_result[1]
+    decoded = []
+    pending = []
+    for word_id in word_ids:
+        if word_id == -1:
+            # candidate finished; its first token is left out of the text
+            decoded.append(' '.join(map(dictionary.get, pending[1:])))
+            pending = []
+            continue
+        pending.append(word_id)
+    width = len(beam_probs[0])
+    grouped = [
+        decoded[start:start + width]
+        for start in range(0, len(decoded), width)
+    ]
+    return beam_probs, grouped
+def infer():
+    """Run beam search inference on the first WMT14 test instances.
+
+    Builds the generating network, loads parameters from
+    ``args.model_filepath``, decodes up to ``args.infer_data_num`` test
+    instances and prints each source sentence with its beam candidates.
+    """
+    # local import: `random` is not among this script's module-level imports
+    import random
+    # create network config
+    source_words = paddle.layer.data(
+        name="source_words",
+        type=paddle.data_type.integer_value_sequence(args.dict_size))
+    beam_gen = memory_enhanced_seq2seq(
+        encoder_input=source_words,
+        decoder_input=None,
+        decoder_target=None,
+        hidden_size=args.hidden_size,
+        word_vec_dim=args.word_vec_dim,
+        dict_size=args.dict_size,
+        is_generating=True,
+        beam_size=args.beam_size)
+    # load parameters (the archive handle is closed once parsed)
+    with gzip.open(args.model_filepath) as model_file:
+        parameters = paddle.parameters.Parameters.from_tar(model_file)
+    # prepare infer data
+    random.seed(0)  # for keeping consistency for multiple runs
+    bounded_memory_perturbation = [[
+        random.gauss(0, args.memory_perturb_stddev)
+        for i in xrange(args.hidden_size)
+    ] for j in xrange(args.memory_slot_num)]
+    test_append_reader = reader_append_wrapper(
+        reader=paddle.dataset.wmt14.test(args.dict_size),
+        append_tuple=(bounded_memory_perturbation, ))
+    infer_data = []
+    for i, item in enumerate(test_append_reader()):
+        # stop reading as soon as enough instances have been collected
+        if i >= args.infer_data_num:
+            break
+        # item[0]: source word ids; item[3]: presumably the appended
+        # perturbation (assumes the wmt14 reader yields three fields)
+        infer_data.append((item[0], item[3], ))
+    # run inference
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=infer_data,
+        field=['prob', 'id'])
+    # parse beam result and print
+    source_dict, target_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+    beam_probs, beam_sentences = parse_beam_search_result(beam_result,
+                                                          target_dict)
+    # iterate over what was actually read: the test set may hold fewer
+    # instances than args.infer_data_num
+    for i in xrange(len(infer_data)):
+        print "\n***************************************************\n"
+        print "src:", ' '.join(
+            [source_dict.get(word) for word in infer_data[i][0]]), "\n"
+        for j in xrange(args.beam_size):
+            print "prob = %f : %s" % (beam_probs[i][j], beam_sentences[i][j])
+def main():
+    """Initialize PaddlePaddle from the command line options and run inference."""
+    # honour --use_gpu / --trainer_count instead of hard-coding CPU and one
+    # trainer; the option defaults (False, 1) keep the previous behaviour
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+    infer()
+if __name__ == '__main__':
+    main()
--- a/mt_with_external_memory/model.py
+++ b/mt_with_external_memory/model.py
+""" 
+    Contains model configuration for external-memory-enhanced seq2seq.
+    The "external memory" refers to two types of memories.
+    - Unbounded memory: i.e. vanilla attention mechanism in Seq2Seq.
+    - Bounded memory: i.e. external memory in NTM.
+    Both types of external memories are exploited to enhance the vanilla
+    Seq2Seq neural machine translation.
+    The implementation primarily follows the paper
+    `Memory-enhanced Decoder for Neural Machine Translation
+    <https://arxiv.org/abs/1606.02003>`_,
+    with some minor differences (will be listed in README.md).
+    For details about "external memory", please also refer to
+    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
+"""
+import paddle.v2 as paddle
+from external_memory import ExternalMemory
+def bidirectional_gru_encoder(input, size, word_vec_dim):
+    """Encode a token sequence with a forward and a backward GRU.
+
+    :params input: Encoder input layer, fed to the embedding lookup.
+    :type input: LayerOutput
+    :params size: Hidden cell number of each encoder GRU.
+    :type size: int
+    :params word_vec_dim: Word embedding size.
+    :type word_vec_dim: int
+    :return: Tuple of 1. concatenated forward and backward hidden sequence.
+             2. first step of the backward rnn output sequence.
+    :rtype: tuple of LayerOutput
+    """
+    # embedding lookup for every token
+    token_vectors = paddle.layer.embedding(input=input, size=word_vec_dim)
+    # per-token states in both directions (layers are created in the same
+    # order as before so that auto-generated layer names do not change)
+    left_to_right = paddle.networks.simple_gru(
+        input=token_vectors, size=size, reverse=False)
+    right_to_left = paddle.networks.simple_gru(
+        input=token_vectors, size=size, reverse=True)
+    per_token_states = paddle.layer.concat(
+        input=[left_to_right, right_to_left])
+    # sentence-level summary taken from the backward pass
+    sentence_state = paddle.layer.first_seq(input=right_to_left)
+    return per_token_states, sentence_state
+def memory_enhanced_decoder(input, target, initial_state, source_context, size,
+                            word_vec_dim, dict_size, is_generating, beam_size):
+    """GRU sequence decoder enhanced with external memory.
+
+    The "external memory" refers to two types of memories.
+    - Unbounded memory: i.e. attention mechanism in Seq2Seq.
+    - Bounded memory: i.e. external memory in NTM.
+    Both types of external memories can be implemented with
+    ExternalMemory class, and are both exploited in this enhanced RNN decoder.
+
+    The vanilla RNN/LSTM/GRU also has a narrow memory mechanism, namely the
+    hidden state vector (or cell state in LSTM) carrying information through
+    a span of sequence time, which is a successful design enriching the model
+    with the capability to "remember" things in the long run. However, such a
+    vector state is somewhat limited to a very narrow memory bandwidth. External
+    memory introduced here could easily increase the memory capacity with linear
+    complexity cost (rather than quadratic for vector state).
+
+    This enhanced decoder expands its "memory passage" through two
+    ExternalMemory objects:
+    - Bounded memory for handling long-term information exchange within decoder
+      itself. A direct expansion of traditional "vector" state.
+    - Unbounded memory for handling source language's token-wise information.
+      Exactly the attention mechanism over Seq2Seq.
+
+    Notice that we take the attention mechanism as a particular form of external
+    memory, with read-only memory bank initialized with encoder states, and a
+    read head with content-based addressing (attention). From this view point,
+    we arrive at a better understanding of attention mechanism itself and other
+    external memory, and a concise and unified implementation for them.
+
+    For more details about external memory, please refer to
+    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
+    For more details about this memory-enhanced decoder, please
+    refer to `Memory-enhanced Decoder for Neural Machine Translation
+    <https://arxiv.org/abs/1606.02003>`_. This implementation is highly
+    correlated to this paper, but with minor differences (e.g. put "write"
+    before "read" to bypass a potential bug in V2 APIs. See
+    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_).
+
+    In generating mode the beam search uses bos_id=0, eos_id=1 and a maximum
+    length of 100, and looks up the embedding parameter named
+    "_decoder_word_embedding", the same name given to the training embedding.
+
+    :params input: Decoder input. Only used when is_generating is False.
+    :type input: LayerOutput
+    :params target: Decoder target. Only used when is_generating is False.
+    :type target: LayerOutput
+    :params initial_state: Initial hidden state.
+    :type initial_state: LayerOutput
+    :params source_context: Group of context hidden states for each token in the
+                            source sentence, for attention mechanism.
+    :type source_context: LayerOutput
+    :params size: Hidden cell number in decoder rnn.
+    :type size: int
+    :params word_vec_dim: Word embedding size.
+    :type word_vec_dim: int
+    :param dict_size: Vocabulary size.
+    :type dict_size: int
+    :params is_generating: Whether for beam search inferencing (True) or
+                           for training (False).
+    :type is_generating: bool
+    :params beam_size: Beam search width. Only used when is_generating is True.
+    :type beam_size: int
+    :return: Cost layer if is_generating=False; Beam search layer if
+             is_generating = True.
+    :rtype: LayerOutput
+    """
+    # prepare initial bounded and unbounded memory
+    # every bounded memory slot starts from the same projection of the
+    # average-pooled source context ...
+    bounded_memory_slot_init = paddle.layer.fc(
+        input=paddle.layer.pooling(
+            input=source_context, pooling_type=paddle.pooling.Avg()),
+        size=size,
+        act=paddle.activation.Sigmoid())
+    # ... plus a perturbation fed in as data; its sequence length determines
+    # how many slots the expanded initial memory has
+    bounded_memory_perturbation = paddle.layer.data(
+        name='bounded_memory_perturbation',
+        type=paddle.data_type.dense_vector_sequence(size))
+    bounded_memory_init = paddle.layer.addto(
+        input=[
+            paddle.layer.expand(
+                input=bounded_memory_slot_init,
+                expand_as=bounded_memory_perturbation),
+            bounded_memory_perturbation
+        ],
+        act=paddle.activation.Linear())
+    # the unbounded memory is the sequence of source context states itself
+    unbounded_memory_init = source_context
+    # prepare step function for recurrent group
+    def recurrent_decoder_step(cur_embedding):
+        # create hidden state, bounded and unbounded memory.
+        # `state` is linked by name to the gru_step layer "gru_decoder" below
+        state = paddle.layer.memory(
+            name="gru_decoder", size=size, boot_layer=initial_state)
+        bounded_memory = ExternalMemory(
+            name="bounded_memory",
+            mem_slot_size=size,
+            boot_layer=bounded_memory_init,
+            readonly=False,
+            enable_interpolation=True)
+        # slot size is size * 2 for the unbounded memory
+        # NOTE(review): presumably because source_context concatenates two
+        # encoder directions of width `size` - confirm against the caller
+        unbounded_memory = ExternalMemory(
+            name="unbounded_memory",
+            mem_slot_size=size * 2,
+            boot_layer=unbounded_memory_init,
+            readonly=True,
+            enable_interpolation=False)
+        # write bounded memory (deliberately before the read, see docstring)
+        bounded_memory.write(state)
+        # read bounded memory
+        bounded_memory_read = bounded_memory.read(state)
+        # prepare key for unbounded memory
+        key_for_unbounded_memory = paddle.layer.fc(
+            input=[bounded_memory_read, cur_embedding],
+            size=size,
+            act=paddle.activation.Tanh(),
+            bias_attr=False)
+        # read unbounded memory (i.e. attention mechanism)
+        context = unbounded_memory.read(key_for_unbounded_memory)
+        # gated recurrent unit; its input is projected to size * 3
+        gru_inputs = paddle.layer.fc(
+            input=[context, cur_embedding, bounded_memory_read],
+            size=size * 3,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        gru_output = paddle.layer.gru_step(
+            name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
+        # step output: softmax over the vocabulary
+        return paddle.layer.fc(
+            input=[gru_output, context, cur_embedding],
+            size=dict_size,
+            act=paddle.activation.Softmax(),
+            bias_attr=True)
+    if not is_generating:
+        # training: embed the given decoder input and compute the cost
+        # against the target
+        target_embeddings = paddle.layer.embedding(
+            input=input,
+            size=word_vec_dim,
+            param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
+        decoder_result = paddle.layer.recurrent_group(
+            name="decoder_group",
+            step=recurrent_decoder_step,
+            input=[target_embeddings])
+        cost = paddle.layer.classification_cost(
+            input=decoder_result, label=target)
+        return cost
+    else:
+        # generating: the step input is a GeneratedInput that refers to the
+        # training embedding by parameter name
+        target_embeddings = paddle.layer.GeneratedInput(
+            size=dict_size,
+            embedding_name="_decoder_word_embedding",
+            embedding_size=word_vec_dim)
+        beam_gen = paddle.layer.beam_search(
+            name="decoder_group",
+            step=recurrent_decoder_step,
+            input=[target_embeddings],
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=100)
+        return beam_gen
+def memory_enhanced_seq2seq(encoder_input, decoder_input, decoder_target,
+                            hidden_size, word_vec_dim, dict_size, is_generating,
+                            beam_size):
+    """Seq2Seq model whose decoder is enhanced with external memory.
+
+    Chains bidirectional_gru_encoder(...) and memory_enhanced_decoder(...):
+    the encoder's per-token states become the decoder's source context, and
+    its sequence-level encoding becomes the decoder's initial state.
+
+    The "external memory" refers to two types of memories.
+    - Unbounded memory: i.e. attention mechanism in Seq2Seq.
+    - Bounded memory: i.e. external memory in NTM.
+    Please refer to the function comments of memory_enhanced_decoder(...).
+
+    For more details about external memory, please refer to
+    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
+    For more details about this memory-enhanced Seq2Seq, please
+    refer to `Memory-enhanced Decoder for Neural Machine Translation
+    <https://arxiv.org/abs/1606.02003>`_.
+
+    :params encoder_input: Encoder input.
+    :type encoder_input: LayerOutput
+    :params decoder_input: Decoder input.
+    :type decoder_input: LayerOutput
+    :params decoder_target: Decoder target.
+    :type decoder_target: LayerOutput
+    :params hidden_size: Hidden cell number, both in encoder and decoder rnn.
+    :type hidden_size: int
+    :params word_vec_dim: Word embedding size.
+    :type word_vec_dim: int
+    :param dict_size: Vocabulary size.
+    :type dict_size: int
+    :params is_generating: Whether for beam search inferencing (True) or
+                           for training (False).
+    :type is_generating: bool
+    :params beam_size: Beam search width.
+    :type beam_size: int
+    :return: Cost layer if is_generating=False; Beam search layer if
+             is_generating = True.
+    :rtype: LayerOutput
+    """
+    # encode the source sentence
+    encoder_outputs = bidirectional_gru_encoder(
+        input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)
+    per_token_context, sentence_summary = encoder_outputs
+    # decode, either for training cost or for beam search generation
+    decoder_options = dict(
+        input=decoder_input,
+        target=decoder_target,
+        initial_state=sentence_summary,
+        source_context=per_token_context,
+        size=hidden_size,
+        word_vec_dim=word_vec_dim,
+        dict_size=dict_size,
+        is_generating=is_generating,
+        beam_size=beam_size)
+    return memory_enhanced_decoder(**decoder_options)
--- a/mt_with_external_memory/train.py
+++ b/mt_with_external_memory/train.py
+"""
+    Contains training script for machine translation with external memory.
+"""
+import argparse
+import sys
+import gzip
+import distutils.util
+import random
+import paddle.v2 as paddle
+from external_memory import ExternalMemory
+from model import memory_enhanced_seq2seq
+from data_utils import reader_append_wrapper
+parser = argparse.ArgumentParser(description=__doc__)
+# Command line options as (flag, default, type, help), in registration order.
+_ARG_SPECS = [
+    ("--dict_size", 30000, int, "Vocabulary size. (default: %(default)s)"),
+    ("--word_vec_dim", 512, int, "Word embedding size. (default: %(default)s)"),
+    ("--hidden_size", 1024, int,
+     "Hidden cell number in RNN. (default: %(default)s)"),
+    ("--memory_slot_num", 8, int,
+     "External memory slot number. (default: %(default)s)"),
+    ("--use_gpu", False, distutils.util.strtobool,
+     "Use gpu or not. (default: %(default)s)"),
+    ("--trainer_count", 1, int, "Trainer number. (default: %(default)s)"),
+    ("--num_passes", 100, int, "Training epochs. (default: %(default)s)"),
+    ("--batch_size", 5, int, "Batch size. (default: %(default)s)"),
+    ("--memory_perturb_stddev", 0.1, float,
+     "Memory perturb stddev for memory initialization."
+     "(default: %(default)s)"),
+]
+for _flag, _default, _type, _help in _ARG_SPECS:
+    parser.add_argument(_flag, default=_default, type=_type, help=_help)
+args = parser.parse_args()
+def train():
+    """Train the memory-enhanced Seq2Seq model on WMT14.
+
+    Builds the training network, feeds it WMT14 instances extended with a
+    fixed bounded-memory perturbation, and writes checkpoints under
+    ``checkpoints/`` (the latest parameters every 10 batches, plus one
+    archive per pass).
+    """
+    # local import: `os` is not among this script's module-level imports
+    import os
+    # create optimizer
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=5e-5,
+        gradient_clipping_threshold=5,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    # create network config
+    source_words = paddle.layer.data(
+        name="source_words",
+        type=paddle.data_type.integer_value_sequence(args.dict_size))
+    target_words = paddle.layer.data(
+        name="target_words",
+        type=paddle.data_type.integer_value_sequence(args.dict_size))
+    target_next_words = paddle.layer.data(
+        name='target_next_words',
+        type=paddle.data_type.integer_value_sequence(args.dict_size))
+    cost = memory_enhanced_seq2seq(
+        encoder_input=source_words,
+        decoder_input=target_words,
+        decoder_target=target_next_words,
+        hidden_size=args.hidden_size,
+        word_vec_dim=args.word_vec_dim,
+        dict_size=args.dict_size,
+        is_generating=False,
+        beam_size=None)
+    # create parameters and trainer
+    parameters = paddle.parameters.create(cost)
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+    # create data readers; index 3 is the perturbation appended to every
+    # instance by reader_append_wrapper
+    feeding = {
+        "source_words": 0,
+        "target_words": 1,
+        "target_next_words": 2,
+        "bounded_memory_perturbation": 3
+    }
+    random.seed(0)  # for keeping consistency for multiple runs
+    bounded_memory_perturbation = [[
+        random.gauss(0, args.memory_perturb_stddev)
+        for i in xrange(args.hidden_size)
+    ] for j in xrange(args.memory_slot_num)]
+    train_append_reader = reader_append_wrapper(
+        reader=paddle.dataset.wmt14.train(args.dict_size),
+        append_tuple=(bounded_memory_perturbation, ))
+    train_batch_reader = paddle.batch(
+        reader=paddle.reader.shuffle(reader=train_append_reader, buf_size=8192),
+        batch_size=args.batch_size)
+    test_append_reader = reader_append_wrapper(
+        reader=paddle.dataset.wmt14.test(args.dict_size),
+        append_tuple=(bounded_memory_perturbation, ))
+    test_batch_reader = paddle.batch(
+        reader=paddle.reader.shuffle(reader=test_append_reader, buf_size=8192),
+        batch_size=args.batch_size)
+    # create event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+                with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
+                    parameters.to_tar(f)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_batch_reader, feeding=feeding)
+            # report the cost measured on the test set (result.cost), not a
+            # cost attribute of the end-of-pass event
+            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, result.cost,
+                                                  result.metrics)
+            with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
+                           'w') as f:
+                parameters.to_tar(f)
+    # run train; the checkpoint directory must exist before the first save
+    if not os.path.exists('checkpoints'):
+        os.mkdir('checkpoints')
+    trainer.train(
+        reader=train_batch_reader,
+        event_handler=event_handler,
+        num_passes=args.num_passes,
+        feeding=feeding)
+def main():
+    """Configure PaddlePaddle from the parsed options, then start training."""
+    init_options = {
+        "use_gpu": args.use_gpu,
+        "trainer_count": args.trainer_count,
+    }
+    paddle.init(**init_options)
+    train()
+if __name__ == '__main__':
+    main()