From ae7ef7929a0bce79c5de03366840711e8e77f5b6 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Sun, 10 Sep 2017 20:36:38 +0800
Subject: [PATCH] Rename some folders and update examples.

---
 data/librispeech/librispeech.py | 2 +-
 data/tiny/tiny.py | 126 ++++++++++++++++++
 examples/librispeech/prepare_data.sh | 2 +-
 examples/librispeech_tiny/prepare_data.sh | 39 ------
 examples/tiny/run_data.sh | 45 +++++++
 .../{librispeech_tiny => tiny}/run_infer.sh | 12 +-
 .../{librispeech_tiny => tiny}/run_test.sh | 0
 .../{librispeech_tiny => tiny}/run_train.sh | 20 +--
 .../{librispeech_tiny => tiny}/run_tune.sh | 0
 infer.py | 6 +-
 {lm => model_utils}/__init__.py | 0
 {models => model_utils}/decoder.py | 2 +
 {lm => model_utils}/lm_scorer.py | 0
 {models => model_utils}/model.py | 7 +-
 {models => model_utils}/network.py | 0
 .../tests/test_decoders.py | 2 +-
 models/__init__.py | 0
 lm/run.sh => models/lm/download_en.sh | 3 -
 test.py | 6 +-
 tools/build_vocab.py | 6 +-
 tools/tune.py | 6 +-
 train.py | 4 +-
 22 files changed, 209 insertions(+), 79 deletions(-)
 create mode 100644 data/tiny/tiny.py
 delete mode 100644 examples/librispeech_tiny/prepare_data.sh
 create mode 100644 examples/tiny/run_data.sh
 rename examples/{librispeech_tiny => tiny}/run_infer.sh (58%)
 rename examples/{librispeech_tiny => tiny}/run_test.sh (100%)
 rename examples/{librispeech_tiny => tiny}/run_train.sh (56%)
 rename examples/{librispeech_tiny => tiny}/run_tune.sh (100%)
 rename {lm => model_utils}/__init__.py (100%)
 rename {models => model_utils}/decoder.py (99%)
 rename {lm => model_utils}/lm_scorer.py (100%)
 rename {models => model_utils}/model.py (97%)
 rename {models => model_utils}/network.py (100%)
 rename {models => model_utils}/tests/test_decoders.py (99%)
 delete mode 100644 models/__init__.py
 rename lm/run.sh => models/lm/download_en.sh (99%)

diff --git a/data/librispeech/librispeech.py b/data/librispeech/librispeech.py
index d963a7d5..14a3804e 100644
--- a/data/librispeech/librispeech.py
+++ b/data/librispeech/librispeech.py
@@ -41,7 +41,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--target_dir",
-    default=DATA_HOME + "/Libri",
+    default=DATA_HOME + "/libri",
     type=str,
     help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
diff --git a/data/tiny/tiny.py b/data/tiny/tiny.py
new file mode 100644
index 00000000..8ba2a13c
--- /dev/null
+++ b/data/tiny/tiny.py
@@ -0,0 +1,126 @@
+"""Prepare tiny dataset for quick tests, based on LibriSpeech dev-clean.
+
+Download, unpack and create manifest files.
+A manifest file is a JSON-format file with each line containing the
+metadata (i.e. audio filepath, transcript and audio duration)
+of one audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import distutils.util
+import os
+import sys
+import tarfile
+import argparse
+import soundfile
+import json
+import codecs
+from paddle.v2.dataset.common import md5file
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = "http://www.openslr.org/resources/12"
+URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
+MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/tiny",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
(default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def download(url, md5sum, target_dir): + """ + Download file from url to target_dir, and check md5sum. + """ + if not os.path.exists(target_dir): os.makedirs(target_dir) + filepath = os.path.join(target_dir, url.split("/")[-1]) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + os.system("wget -c " + url + " -P " + target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir): + """ + Unpack the file to the target_dir. + """ + print("Unpacking %s ..." % filepath) + tar = tarfile.open(filepath) + tar.extractall(target_dir) + tar.close() + + +def create_manifest(data_dir, manifest_path): + """ + Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(data_dir, subfolder, text_filelist[0]) + for line in open(text_filepath): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(data_dir, subfolder, + segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """ + Download, unpack and create summmary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
+              target_dir)
+    # create manifest json file
+    create_manifest(target_dir, manifest_path)
+
+
+def main():
+    prepare_dataset(
+        url=URL_DEV_CLEAN,
+        md5sum=MD5_DEV_CLEAN,
+        target_dir=os.path.join(args.target_dir, "dev-clean"),
+        manifest_path=args.manifest_prefix + ".dev-clean")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/librispeech/prepare_data.sh b/examples/librispeech/prepare_data.sh
index a18402ea..6e999770 100644
--- a/examples/librispeech/prepare_data.sh
+++ b/examples/librispeech/prepare_data.sh
@@ -16,7 +16,7 @@ fi
 
 cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
 
-# build vocabulary (for English data, we can just skip this)
+# build vocabulary (can be skipped for English, as a vocabulary is already provided)
 # python tools/build_vocab.py \
 # --count_threshold=0 \
 # --vocab_path='data/librispeech/eng_vocab.txt' \
diff --git a/examples/librispeech_tiny/prepare_data.sh b/examples/librispeech_tiny/prepare_data.sh
deleted file mode 100644
index a18402ea..00000000
--- a/examples/librispeech_tiny/prepare_data.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#! /usr/bin/bash
-
-pushd ../..
-
-# download data, generate manifests
-python data/librispeech/librispeech.py \
---manifest_prefix='data/librispeech/manifest' \
---full_download='True' \
---target_dir='~/.cache/paddle/dataset/speech/Libri'
-
-if [ $? -ne 0 ]; then
-    echo "Prepare LibriSpeech failed. Terminated."
-    exit 1
-fi
-
-cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
-
-
-# build vocabulary (for English data, we can just skip this)
-# python tools/build_vocab.py \
-# --count_threshold=0 \
-# --vocab_path='data/librispeech/eng_vocab.txt' \
-# --manifest_paths='data/librispeech/manifeset.train'
-
-
-# compute mean and stddev for normalizer
-python tools/compute_mean_std.py \
---manifest_path='data/librispeech/manifest.train' \
---num_samples=2000 \
---specgram_type='linear' \
---output_path='data/librispeech/mean_std.npz'
-
-if [ $? -ne 0 ]; then
-    echo "Compute mean and stddev failed. Terminated."
-    exit 1
-fi
-
-
-echo "LibriSpeech Data preparation done."
diff --git a/examples/tiny/run_data.sh b/examples/tiny/run_data.sh
new file mode 100644
index 00000000..44345d8c
--- /dev/null
+++ b/examples/tiny/run_data.sh
@@ -0,0 +1,45 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+# download data, generate manifests
+python data/tiny/tiny.py \
+--manifest_prefix='data/tiny/manifest' \
+--target_dir=$HOME'/.cache/paddle/dataset/speech/tiny'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare tiny data failed. Terminated."
+    exit 1
+fi
+
+cat data/tiny/manifest.dev-clean | head -n 32 > data/tiny/manifest.train
+cat data/tiny/manifest.dev-clean | head -n 48 | tail -n 16 > data/tiny/manifest.dev
+cat data/tiny/manifest.dev-clean | head -n 64 | tail -n 16 > data/tiny/manifest.test
+
+
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/tiny/vocab.txt' \
+--manifest_paths='data/tiny/manifest.train'
+
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+
+
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/tiny/manifest.train' \
+--num_samples=32 \
+--specgram_type='linear' \
+--output_path='data/tiny/mean_std.npz'
+
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+
+echo "Tiny data preparation done."
diff --git a/examples/librispeech_tiny/run_infer.sh b/examples/tiny/run_infer.sh
similarity index 58%
rename from examples/librispeech_tiny/run_infer.sh
rename to examples/tiny/run_infer.sh
index 619d546e..f09bc663 100644
--- a/examples/librispeech_tiny/run_infer.sh
+++ b/examples/tiny/run_infer.sh
@@ -4,7 +4,7 @@ pushd ../..
 
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
---num_samples=10 \
+--num_samples=4 \
 --trainer_count=1 \
 --beam_size=500 \
 --num_proc_bsearch=12 \
@@ -17,11 +17,11 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
---infer_manifest='data/librispeech/manifest.dev-clean' \
---mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--infer_manifest='data/tiny/manifest.train' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
+--model_path='checkpoints/params.pass-9.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
diff --git a/examples/librispeech_tiny/run_test.sh b/examples/tiny/run_test.sh
similarity index 100%
rename from examples/librispeech_tiny/run_test.sh
rename to examples/tiny/run_test.sh
diff --git a/examples/librispeech_tiny/run_train.sh b/examples/tiny/run_train.sh
similarity index 56%
rename from examples/librispeech_tiny/run_train.sh
rename to examples/tiny/run_train.sh
index 14672167..7ca33687 100644
--- a/examples/librispeech_tiny/run_train.sh
+++ b/examples/tiny/run_train.sh
@@ -2,17 +2,17 @@
 
 pushd ../..
 
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python -u train.py \
---batch_size=256 \
+--batch_size=2 \
---trainer_count=8 \
+--trainer_count=1 \
---num_passes=50 \
+--num_passes=10 \
---num_proc_data=12 \
+--num_proc_data=1 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
 --num_iter_print=100 \
---learning_rate=5e-4 \
+--learning_rate=5e-5 \
 --max_duration=27.0 \
 --min_duration=0.0 \
 --use_sortagrad=True \
@@ -20,10 +20,10 @@ python -u train.py \
 --use_gpu=True \
 --is_local=True \
 --share_rnn_weights=True \
---train_manifest='data/librispeech/manifest.train' \
---dev_manifest='data/librispeech/manifest.dev' \
---mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
+--train_manifest='data/tiny/manifest.train' \
+--dev_manifest='data/tiny/manifest.train' \
+--mean_std_path='data/tiny/mean_std.npz' \
+--vocab_path='data/tiny/vocab.txt' \
 --output_model_dir='./checkpoints' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
diff --git a/examples/librispeech_tiny/run_tune.sh b/examples/tiny/run_tune.sh
similarity index 100%
rename from examples/librispeech_tiny/run_tune.sh
rename to examples/tiny/run_tune.sh
diff --git a/infer.py b/infer.py
index 1ce969ae..73e200b4 100644
--- a/infer.py
+++ b/infer.py
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
 
@@ -35,10 +35,10 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
diff --git a/lm/__init__.py b/model_utils/__init__.py
similarity index 100%
rename from lm/__init__.py
rename to model_utils/__init__.py
diff --git a/models/decoder.py b/model_utils/decoder.py
similarity index 99%
rename from models/decoder.py
rename to model_utils/decoder.py
index 61ead25c..ffba2731 100644
--- a/models/decoder.py
+++ b/model_utils/decoder.py
@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq,
                 prob = prob * ext_scoring_func(result)
             log_prob = log(prob)
             beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
 
     ## output top beam_size decoding results
     beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
diff --git a/lm/lm_scorer.py b/model_utils/lm_scorer.py
similarity index 100%
rename from lm/lm_scorer.py
rename to model_utils/lm_scorer.py
diff --git a/models/model.py b/model_utils/model.py
similarity index 97%
rename from models/model.py
rename to model_utils/model.py
index 93c4c41b..cf146f8c 100644
--- a/models/model.py
+++ b/model_utils/model.py
@@ -8,9 +8,10 @@ import os
 import time
 import gzip
 import paddle.v2 as paddle
-from lm.lm_scorer import LmScorer
-from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
-from models.network import deep_speech_v2_network
+from model_utils.lm_scorer import LmScorer
+from model_utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
+from model_utils.decoder import ctc_beam_search_decoder_batch
+from model_utils.network import deep_speech_v2_network
 
 
 class DeepSpeech2Model(object):
diff --git a/models/network.py b/model_utils/network.py
similarity index 100%
rename from models/network.py
rename to model_utils/network.py
diff --git a/models/tests/test_decoders.py b/model_utils/tests/test_decoders.py
similarity index 99%
rename from models/tests/test_decoders.py
rename to model_utils/tests/test_decoders.py
index acce46af..adf36eef 100644
--- a/models/tests/test_decoders.py
+++ b/model_utils/tests/test_decoders.py
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-from models import decoder
+from model_utils import decoder
 
 
 class TestDecoders(unittest.TestCase):
diff --git a/models/__init__.py b/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lm/run.sh b/models/lm/download_en.sh
similarity index 99%
rename from lm/run.sh
rename to models/lm/download_en.sh
index 2108ea55..5ca33c67 100644
--- a/lm/run.sh
+++ b/models/lm/download_en.sh
@@ -14,6 +14,3 @@ if [ $MD5 != $md5_tmp ]; then
     echo "Fail to download the language model!"
     exit 1
 fi
-
-
-
diff --git a/test.py b/test.py
index 747e40df..791bfd58 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
 
@@ -36,14 +36,14 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('decoding_method', str,
         'ctc_beam_search',
diff --git a/tools/build_vocab.py b/tools/build_vocab.py
index ef9bde49..e167e92a 100644
--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
@@ -21,10 +21,8 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
 add_arg('vocab_path', str,
-        None,
-        "Filepath to write the vocabulary.",
-        nargs='+',
-        required=True)
+        'data/librispeech/vocab.txt',
+        "Filepath to write the vocabulary.")
 add_arg('manifest_paths', str,
         None,
         "Filepaths of manifests for building vocabulary. "
diff --git a/tools/tune.py b/tools/tune.py
index 7a237910..25e495f1 100644
--- a/tools/tune.py
+++ b/tools/tune.py
@@ -9,7 +9,7 @@ import functools
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer
 from utils.utility import add_arguments, print_arguments
 
@@ -41,10 +41,10 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
diff --git a/train.py b/train.py
index 4a7a0eda..bbf1cd72 100644
--- a/train.py
+++ b/train.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import argparse
 import functools
 import paddle.v2 as paddle
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.data import DataGenerator
 from utils.utility import add_arguments, print_arguments
 
@@ -41,7 +41,7 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('init_model_path', str,
         None,
-- 
GitLab