Merge pull request #69 from xinghai-sun/ds2_mkdir_data

Refactor decoder interfaces and add ./data directory.

Merge pull request #69 from xinghai-sun/ds2_mkdir_data
Refactor decoder interfaces and add ./data directory.
5974ea96 · Yibing Liu · GitHub · ec9cce9e · e17f845c · 5974ea96
7 changed file
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
 # Deep Speech 2 on PaddlePaddle

-## Quick Start
+## Installation

-### Installation
-
-Please replace `$PADDLE_INSTALL_DIR` with your paddle installation directory.
+Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.

 ```
 pip install -r requirements.txt
@@ -13,10 +11,14 @@ export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/li

 For some machines, we also need to install libsndfile1. Details to be added.

-### Preparing Dataset(s)
+## Usage
+
+### Preparing Data

 ```
+cd data
 python librispeech.py
+cd ..
 ```

 More help for arguments:

--- a/deep_speech_2/eng_vocab.txt
+++ b/deep_speech_2/eng_vocab.txt
--- a/deep_speech_2/librispeech.py
+++ b/deep_speech_2/librispeech.py
@@ -15,7 +15,7 @@ import argparse
 import soundfile
 import json

-DATA_HOME = os.path.expanduser('~/.cache2/paddle/dataset/speech')
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
@@ -30,7 +30,7 @@ MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
 MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
 MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
 MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
-MD5_TRAIN_CLEAN_500 = "d1a0fd59409feb2c614ce4d30c387708"
+MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

 parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')

--- a/deep_speech_2/decoder.py
+++ b/deep_speech_2/decoder.py
+"""
+    CTC-like decoder utilitis.
+"""
+
+from itertools import groupby
+import numpy as np
+
+
+def ctc_best_path_decode(probs_seq, vocabulary):
+    """
+    Best path decoding, also called argmax decoding or greedy decoding.
+    Path consisting of the most probable tokens are further post-processed to
+    remove consecutive repetitions and all blanks.
+
+    :param probs_seq: 2-D list of probabilities over the vocabulary for each
+                      character. Each element is a list of float probabilities
+                      for one character.
+    :type probs_seq: list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :return: Decoding result string.
+    :rtype: baseline
+    """
+    # dimension verification
+    for probs in probs_seq:
+        if not len(probs) == len(vocabulary) + 1:
+            raise ValueError("probs_seq dimension mismatchedd with vocabulary")
+    # argmax to get the best index for each time step
+    max_index_list = list(np.array(probs_seq).argmax(axis=1))
+    # remove consecutive duplicate indexes
+    index_list = [index_group[0] for index_group in groupby(max_index_list)]
+    # remove blank indexes
+    blank_index = len(vocabulary)
+    index_list = [index for index in index_list if index != blank_index]
+    # convert index list to string
+    return ''.join([vocabulary[index] for index in index_list])
+
+
+def ctc_decode(probs_seq, vocabulary, method):
+    """
+    CTC-like sequence decoding from a sequence of likelihood probablilites. 
+
+    :param probs_seq: 2-D list of probabilities over the vocabulary for each
+                      character. Each element is a list of float probabilities
+                      for one character.
+    :type probs_seq: list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param method: Decoding method name, with options: "best_path".
+    :type method: basestring
+    :return: Decoding result string.
+    :rtype: baseline
+    """
+    for prob_list in probs_seq:
+        if not len(prob_list) == len(vocabulary) + 1:
+            raise ValueError("probs dimension mismatchedd with vocabulary")
+    if method == "best_path":
+        return ctc_best_path_decode(probs_seq, vocabulary)
+    else:
+        raise ValueError("Decoding method [%s] is not supported.")
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -3,12 +3,12 @@
 """

 import paddle.v2 as paddle
-from itertools import groupby
 import distutils.util
 import argparse
 import gzip
 from audio_data_utils import DataGenerator
 from model import deep_speech2
+from decoder import ctc_decode

 parser = argparse.ArgumentParser(
    description='Simplified version of DeepSpeech2 inference.')
@@ -39,12 +39,12 @@ parser.add_argument(
    help="Use gpu or not. (default: %(default)s)")
 parser.add_argument(
    "--normalizer_manifest_path",
-    default='./manifest.libri.train-clean-100',
+    default='data/manifest.libri.train-clean-100',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
    "--decode_manifest_path",
-    default='./manifest.libri.test-clean',
+    default='data/manifest.libri.test-clean',
    type=str,
    help="Manifest path for decoding. (default: %(default)s)")
 parser.add_argument(
@@ -52,34 +52,28 @@ parser.add_argument(
    default='./params.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='data/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
 args = parser.parse_args()


-def remove_duplicate_and_blank(id_list, blank_id):
-    """
-    Postprocessing for max-ctc-decoder.
-    - remove consecutive duplicate tokens.
-    - remove blanks.
-    """
-    # remove consecutive duplicate tokens
-    id_list = [x[0] for x in groupby(id_list)]
-    # remove blanks
-    return [id for id in id_list if id != blank_id]
-
-
-def best_path_decode():
+def infer():
    """
    Max-ctc-decoding for DeepSpeech2.
    """
    # initialize data generator
    data_generator = DataGenerator(
-        vocab_filepath='eng_vocab.txt',
+        vocab_filepath=args.vocab_filepath,
        normalizer_manifest_path=args.normalizer_manifest_path,
        normalizer_num_samples=200,
        max_duration=20.0,
        min_duration=0.0,
        stride_ms=10,
        window_ms=20)
+
    # create network config
    dict_size = data_generator.vocabulary_size()
    vocab_list = data_generator.vocabulary_list()
@@ -91,13 +85,14 @@ def best_path_decode():
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(dict_size))
-    _, max_id = deep_speech2(
+    output_probs = deep_speech2(
        audio_data=audio_data,
        text_data=text_data,
        dict_size=dict_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
-        rnn_size=args.rnn_layer_size)
+        rnn_size=args.rnn_layer_size,
+        is_inference=True)

    # load parameters
    parameters = paddle.parameters.Parameters.from_tar(
@@ -114,30 +109,28 @@ def best_path_decode():
        shuffle=False)
    infer_data = test_batch_reader().next()

-    # run max-ctc-decoding
-    max_id_results = paddle.infer(
-        output_layer=max_id,
-        parameters=parameters,
-        input=infer_data,
-        field=['id'])
-
-    # postprocess
-    instance_length = len(max_id_results) / args.num_samples
-    instance_list = [
-        max_id_results[i * instance_length:(i + 1) * instance_length]
-        for i in xrange(0, args.num_samples)
+    # run inference
+    infer_results = paddle.infer(
+        output_layer=output_probs, parameters=parameters, input=infer_data)
+    num_steps = len(infer_results) / len(infer_data)
+    probs_split = [
+        infer_results[i * num_steps:(i + 1) * num_steps]
+        for i in xrange(0, len(infer_data))
    ]
-    for i, instance in enumerate(instance_list):
-        id_list = remove_duplicate_and_blank(instance, dict_size)
-        output_transcript = ''.join([vocab_list[id] for id in id_list])
-        target_transcript = ''.join([vocab_list[id] for id in infer_data[i][1]])
-        print("Target Transcript: %s \nOutput Transcript: %s \n" %
-              (target_transcript, output_transcript))
+
+    # decode and print
+    for i, probs in enumerate(probs_split):
+        output_transcription = ctc_decode(
+            probs_seq=probs, vocabulary=vocab_list, method="best_path")
+        target_transcription = ''.join(
+            [vocab_list[index] for index in infer_data[i][1]])
+        print("Target Transcription: %s \nOutput Transcription: %s \n" %
+              (target_transcription, output_transcription))


 def main():
    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
-    best_path_decode()
+    infer()


 if __name__ == '__main__':

--- a/deep_speech_2/model.py
+++ b/deep_speech_2/model.py
@@ -85,7 +85,8 @@ def deep_speech2(audio_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
-                 rnn_size=256):
+                 rnn_size=256,
+                 is_inference=False):
    """
    The whole DeepSpeech2 model structure (a simplified version).

@@ -101,7 +102,12 @@ def deep_speech2(audio_data,
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
-    :return: Tuple of the cost layer and the max_id decoder layer.
+    :param is_inference: False in the training mode, and True in the
+                         inferene mode.
+    :type is_inference: bool
+    :return: If is_inference set False, return a ctc cost layer;
+             if is_inference set True, return a sequence layer of output
+             probability distribution.
    :rtype: tuple of LayerOutput
    """
    # convolution group
@@ -118,19 +124,21 @@ def deep_speech2(audio_data,
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
-    # output token distribution
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
-    # ctc cost
-    cost = paddle.layer.warp_ctc(
-        input=fc,
-        label=text_data,
-        size=dict_size + 1,
-        blank=dict_size,
-        norm_by_times=True)
-    # max decoder
-    max_id = paddle.layer.max_id(input=fc)
-    return cost, max_id
+    if is_inference:
+        # probability distribution with softmax
+        return paddle.layer.mixed(
+            input=paddle.layer.identity_projection(input=fc),
+            act=paddle.activation.Softmax())
+    else:
+        # ctc cost
+        return paddle.layer.warp_ctc(
+            input=fc,
+            label=text_data,
+            size=dict_size + 1,
+            blank=dict_size,
+            norm_by_times=True)
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -60,19 +60,24 @@ parser.add_argument(
    help="Trainer number. (default: %(default)s)")
 parser.add_argument(
    "--normalizer_manifest_path",
-    default='./manifest.libri.train-clean-100',
+    default='data/manifest.libri.train-clean-100',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
    "--train_manifest_path",
-    default='./manifest.libri.train-clean-100',
+    default='data/manifest.libri.train-clean-100',
    type=str,
    help="Manifest path for training. (default: %(default)s)")
 parser.add_argument(
    "--dev_manifest_path",
-    default='./manifest.libri.dev-clean',
+    default='data/manifest.libri.dev-clean',
    type=str,
    help="Manifest path for validation. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='data/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
 args = parser.parse_args()


@@ -82,7 +87,7 @@ def train():
    """
    # initialize data generator
    data_generator = DataGenerator(
-        vocab_filepath='eng_vocab.txt',
+        vocab_filepath=args.vocab_filepath,
        normalizer_manifest_path=args.normalizer_manifest_path,
        normalizer_num_samples=200,
        max_duration=20.0,
@@ -100,13 +105,14 @@ def train():
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(dict_size))
-    cost, _ = deep_speech2(
+    cost = deep_speech2(
        audio_data=audio_data,
        text_data=text_data,
        dict_size=dict_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
-        rnn_size=args.rnn_layer_size)
+        rnn_size=args.rnn_layer_size,
+        is_inference=False)

    # create parameters and optimizer
    parameters = paddle.parameters.create(cost)
@@ -118,21 +124,21 @@ def train():
    # prepare data reader
    train_batch_reader_sortagrad = data_generator.batch_reader_creator(
        manifest_path=args.train_manifest_path,
-        batch_size=args.batch_size // args.trainer_count,
+        batch_size=args.batch_size,
        padding_to=2000,
        flatten=True,
        sort_by_duration=True,
        shuffle=False)
    train_batch_reader_nosortagrad = data_generator.batch_reader_creator(
        manifest_path=args.train_manifest_path,
-        batch_size=args.batch_size // args.trainer_count,
+        batch_size=args.batch_size,
        padding_to=2000,
        flatten=True,
        sort_by_duration=False,
        shuffle=True)
    test_batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.dev_manifest_path,
-        batch_size=args.batch_size // args.trainer_count,
+        batch_size=args.batch_size,
        padding_to=2000,
        flatten=True,
        sort_by_duration=False,
@@ -141,9 +147,7 @@ def train():

    # create event handler
    def event_handler(event):
-        global start_time
-        global cost_sum
-        global cost_counter
+        global start_time, cost_sum, cost_counter
        if isinstance(event, paddle.event.EndIteration):
            cost_sum += event.cost
            cost_counter += 1