From 9ae22c345ad2265c7708f24e0fc148d3f0e37808 Mon Sep 17 00:00:00 2001
From: Xinghai Sun
Date: Thu, 25 May 2017 01:17:18 +0800
Subject: [PATCH] Add librispeech dataset, audio data provider and simplified
 DeepSpeech2 model configuration.

Bug exists when running training.
---
 deep_speech_2/README.md           |   8 +-
 deep_speech_2/audio_data_utils.py | 159 ++++++++++++++++++++++++++
 deep_speech_2/eng_vocab.txt       |  28 ++++
 deep_speech_2/librispeech.py      |  97 +++++++++++++
 deep_speech_2/requirements.sh     |   5 +
 deep_speech_2/train.py            | 187 +++++++++++++++++++++++++++
 6 files changed, 483 insertions(+), 1 deletion(-)
 create mode 100644 deep_speech_2/audio_data_utils.py
 create mode 100644 deep_speech_2/eng_vocab.txt
 create mode 100644 deep_speech_2/librispeech.py
 create mode 100644 deep_speech_2/requirements.sh
 create mode 100644 deep_speech_2/train.py

diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index a0990367..fcadf568 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -1 +1,7 @@
-TBD
+# Deep Speech 2 on PaddlePaddle
+
+```
+sh requirements.sh
+python librispeech.py
+python train.py
+```
diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py
new file mode 100644
index 00000000..2f7bfcf7
--- /dev/null
+++ b/deep_speech_2/audio_data_utils.py
@@ -0,0 +1,159 @@
+import paddle.v2 as paddle
+import logging
+import json
+import random
+import soundfile
+import numpy as np
+import os
+
+# TODO: add z-score normalization.
+
+ENGLISH_CHAR_VOCAB_FILEPATH = "eng_vocab.txt"
+
+logger = logging.getLogger(__name__)
+
+
+def spectrogram_from_file(filename,
+                          stride_ms=10,
+                          window_ms=20,
+                          max_freq=None,
+                          eps=1e-14):
+    """
+    Calculate the log of the linear spectrogram from FFT energies.
+    Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+    """
+    audio, sample_rate = soundfile.read(filename)
+    if audio.ndim >= 2:
+        audio = np.mean(audio, 1)
+    if max_freq is None:
+        max_freq = sample_rate / 2
+    if max_freq > sample_rate / 2:
+        raise ValueError("max_freq must not be greater than half of "
+                         "the sample rate.")
+    if stride_ms > window_ms:
+        raise ValueError("Stride size must not be greater than window size.")
+    stride_size = int(0.001 * sample_rate * stride_ms)
+    window_size = int(0.001 * sample_rate * window_ms)
+    spectrogram, freqs = extract_spectrogram(
+        audio,
+        window_size=window_size,
+        stride_size=stride_size,
+        sample_rate=sample_rate)
+    ind = np.where(freqs <= max_freq)[0][-1] + 1
+    return np.log(spectrogram[:ind, :] + eps)
+
+
+def extract_spectrogram(samples, window_size, stride_size, sample_rate):
+    """
+    Compute the spectrogram for a real discrete signal.
+    Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+    """
+    # extract strided windows
+    truncate_size = (len(samples) - window_size) % stride_size
+    samples = samples[:len(samples) - truncate_size]
+    nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
+    nstrides = (samples.strides[0], samples.strides[0] * stride_size)
+    windows = np.lib.stride_tricks.as_strided(
+        samples, shape=nshape, strides=nstrides)
+    assert np.all(
+        windows[:, 1] == samples[stride_size:(stride_size + window_size)])
+    # window weighting, compute squared Fast Fourier Transform (fft), scaling
+    weighting = np.hanning(window_size)[:, None]
+    fft = np.fft.rfft(windows * weighting, axis=0)
+    fft = np.absolute(fft)**2
+    scale = np.sum(weighting**2) * sample_rate
+    fft[1:-1, :] *= (2.0 / scale)
+    fft[(0, -1), :] /= scale
+    # prepare fft frequency list
+    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
+    return fft, freqs
+
+
+def vocabulary_from_file(vocabulary_path):
+    """
+    Load vocabulary from file.
+    """
+    if os.path.exists(vocabulary_path):
+        vocab_lines = []
+        with open(vocabulary_path, 'r') as file:
+            vocab_lines.extend(file.readlines())
+        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_dict = dict(
+            [(token, id) for (id, token) in enumerate(vocab_list)])
+        return vocab_dict, vocab_list
+    else:
+        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
+
+
+def get_vocabulary_size():
+    vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
+    return len(vocab_dict)
+
+
+def parse_transcript(text, vocabulary):
+    """
+    Convert the transcript text string to a list of token index integers.
+    """
+    return [vocabulary[w] for w in text]
+
+
+def reader_creator(manifest_path,
+                   sort_by_duration=True,
+                   shuffle=False,
+                   max_duration=10.0,
+                   min_duration=0.0):
+    if sort_by_duration and shuffle:
+        sort_by_duration = False
+        logger.warning("When shuffle is set to True, "
+                       "sort_by_duration is forced to False.")
+    vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
+
+    def reader():
+        # read manifest
+        manifest_data = []
+        for json_line in open(manifest_path):
+            try:
+                json_data = json.loads(json_line)
+            except Exception as e:
+                raise ValueError("Error reading manifest: %s" % str(e))
+            if (json_data["duration"] <= max_duration and
+                    json_data["duration"] >= min_duration):
+                manifest_data.append(json_data)
+        # sort (by duration) or shuffle manifest
+        if sort_by_duration:
+            manifest_data.sort(key=lambda x: x["duration"])
+        if shuffle:
+            random.shuffle(manifest_data)
+        # extract spectrogram feature
+        for instance in manifest_data:
+            spectrogram = spectrogram_from_file(instance["audio_filepath"])
+            text = parse_transcript(instance["text"], vocab_dict)
+            yield (spectrogram, text)
+
+    return reader
+
+
+def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
+    def padding_batch(batch):
+        new_batch = []
+        # get target shape within batch
+        nshape_list = [padding]
+        for audio, text in batch:
+            nshape_list.append(audio.shape)
+        target_shape = np.array(nshape_list).max(axis=0)
+        # padding
+        for audio, text in batch:
+            pad_shape = target_shape - audio.shape
+            assert np.all(pad_shape >= 0)
+            padded_audio = np.pad(
+                audio, [(0, pad_shape[0]), (0, pad_shape[1])], mode="constant")
+            if flatten:
+                padded_audio = padded_audio.flatten()
+            new_batch.append((padded_audio, text))
+        return new_batch
+
+    def new_batch_reader():
+        for batch in batch_reader():
+            yield padding_batch(batch)
+
+    return new_batch_reader
diff --git a/deep_speech_2/eng_vocab.txt b/deep_speech_2/eng_vocab.txt
new file mode 100644
index 00000000..8268f3f3
--- /dev/null
+++ b/deep_speech_2/eng_vocab.txt
@@ -0,0 +1,28 @@
+'
+ 
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
diff --git a/deep_speech_2/librispeech.py b/deep_speech_2/librispeech.py
new file mode 100644
index 00000000..fc7b9822
--- /dev/null
+++ b/deep_speech_2/librispeech.py
@@ -0,0 +1,97 @@
+import paddle.v2 as paddle
+import os
+import wget
+import tarfile
+import argparse
+import soundfile
+import json
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz"
+URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
+URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
+
+parser = argparse.ArgumentParser(
+    description='Download and prepare the LibriSpeech dataset.')
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Libri",
+    type=str,
+    help="Directory to save the dataset.")
+parser.add_argument(
+    "--manifest",
+    default="./libri.manifest",
+    type=str,
+    help="Filepath prefix of output manifests.")
+args = parser.parse_args()
+
+
+def download(url, target_dir):
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not os.path.exists(filepath):
+        print("Downloading %s ..." % url)
+        wget.download(url, target_dir)
+        print("")
+    return filepath
+
+
+def unpack(filepath, target_dir):
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    return target_dir
+
+
+def create_manifest(data_dir, manifest_path):
+    print("Creating manifest %s ..." % manifest_path)
+    json_lines = []
+    for subfolder, _, filelist in os.walk(data_dir):
+        text_filelist = [
+            filename for filename in filelist if filename.endswith('trans.txt')
+        ]
+        if len(text_filelist) > 0:
+            text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
+            for line in open(text_filepath):
+                segments = line.strip().split()
+                text = ' '.join(segments[1:]).lower()
+                audio_filepath = os.path.join(data_dir, subfolder,
+                                              segments[0] + '.flac')
+                audio_data, samplerate = soundfile.read(audio_filepath)
+                duration = float(len(audio_data)) / samplerate
+                json_lines.append(
+                    json.dumps({
+                        'audio_filepath': audio_filepath,
+                        'duration': duration,
+                        'text': text
+                    }))
+    with open(manifest_path, 'w') as out_file:
+        for line in json_lines:
+            out_file.write(line + '\n')
+
+
+def prepare_dataset(url, target_dir, manifest_path):
+    filepath = download(url, target_dir)
+    unpacked_dir = unpack(filepath, target_dir)
+    create_manifest(unpacked_dir, manifest_path)
+
+
+def main():
+    prepare_dataset(
+        url=URL_TEST,
+        target_dir=os.path.join(args.target_dir),
+        manifest_path=args.manifest + ".test")
+    prepare_dataset(
+        url=URL_DEV,
+        target_dir=os.path.join(args.target_dir),
+        manifest_path=args.manifest + ".dev")
+    # prepare_dataset(url=URL_TRAIN,
+    #                 target_dir=os.path.join(args.target_dir),
+    #                 manifest_path=args.manifest + ".train")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/deep_speech_2/requirements.sh b/deep_speech_2/requirements.sh
new file mode 100644
index 00000000..7a089169
--- /dev/null
+++ b/deep_speech_2/requirements.sh
@@ -0,0 +1,5 @@
+pip install wget
+pip install soundfile
+
+# For Linux only
+apt-get install libsndfile1
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
new file mode 100644
index 00000000..083a718d
--- /dev/null
+++ b/deep_speech_2/train.py
@@ -0,0 +1,187 @@
+import paddle.v2 as paddle
+import audio_data_utils
+import argparse
+import sys
+import gzip
+
+parser = argparse.ArgumentParser(
+    description='Simplified version of the DeepSpeech2 trainer.')
+parser.add_argument(
+    "--batch_size", default=512, type=int, help="Minibatch size.")
+parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
+parser.add_argument(
+    "--num_passes", default=20, type=int, help="Training pass number.")
+args = parser.parse_args()
+
+
+def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out,
+                  stride, padding, act):
+    conv_layer = paddle.layer.img_conv(
+        input=input,
+        filter_size=filter_size,
+        num_channels=num_channels_in,
+        num_filters=num_channels_out,
+        stride=stride,
+        padding=padding,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=conv_layer, act=act)
+
+
+def bidirectional_simple_rnn_bn_layer(name, input, size, act):
+    def __simple_rnn_step__(input):
+        last_state = paddle.layer.memory(name=name + "_state", size=size)
+        input_fc = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        input_fc_bn = paddle.layer.batch_norm(
+            input=input_fc, act=paddle.activation.Linear())
+        state_fc = paddle.layer.fc(
+            input=last_state,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        return paddle.layer.addto(
+            name=name + "_state", input=[input_fc_bn, state_fc], act=act)
+
+    forward = paddle.layer.recurrent_group(
+        step=__simple_rnn_step__, input=input)
+    return forward
+    # argument reverse is not exposed in V2 recurrent_group
+    # backward = paddle.layer.recurrent_group(
+    #     step=__simple_rnn_step__,
+    #     input=input,
+    #     reverse=True)
+    # return paddle.layer.concat(input=[forward, backward])
+
+
+def conv_group(input):
+    conv1 = conv_bn_layer(
+        input=input,
+        filter_size=(11, 41),
+        num_channels_in=1,
+        num_channels_out=32,
+        stride=(3, 2),
+        padding=(5, 20),
+        act=paddle.activation.BRelu())
+    conv2 = conv_bn_layer(
+        input=conv1,
+        filter_size=(11, 21),
+        num_channels_in=32,
+        num_channels_out=32,
+        stride=(1, 2),
+        padding=(5, 10),
+        act=paddle.activation.BRelu())
+    conv3 = conv_bn_layer(
+        input=conv2,
+        filter_size=(11, 21),
+        num_channels_in=32,
+        num_channels_out=32,
+        stride=(1, 2),
+        padding=(5, 10),
+        act=paddle.activation.BRelu())
+    return conv3
+
+
+def rnn_group(input, size, num_stacks):
+    output = input
+    for i in xrange(num_stacks):
+        output = bidirectional_simple_rnn_bn_layer(
+            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
+    return output
+
+
+def deep_speech2(audio_data, text_data, dict_size):
+    conv_group_output = conv_group(input=audio_data)
+    conv2seq = paddle.layer.block_expand(
+        input=conv_group_output,
+        num_channels=32,
+        stride_x=1,
+        stride_y=1,
+        block_x=1,
+        block_y=21)
+    rnn_group_output = rnn_group(input=conv2seq, size=256, num_stacks=5)
+    fc = paddle.layer.fc(
+        input=rnn_group_output,
+        size=dict_size + 1,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+    cost = paddle.layer.warp_ctc(
+        input=fc,
+        label=text_data,
+        size=dict_size + 1,
+        blank=dict_size,
+        norm_by_times=True)
+    return cost
+
+
+def train():
+    # create network config
+    dict_size = audio_data_utils.get_vocabulary_size()
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram",
+        height=161,
+        width=1000,
+        type=paddle.data_type.dense_vector(161000))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(dict_size))
+    cost = deep_speech2(audio_data, text_data, dict_size)
+
+    # create parameters and optimizer
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=5e-5,
+        gradient_clipping_threshold=5,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    # create data readers
+    feeding = {
+        "audio_spectrogram": 0,
+        "transcript_text": 1,
+    }
+    train_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator("./libri.manifest.dev"),
+            batch_size=args.batch_size // args.trainer),
+        padding=[-1, 1000])
+    test_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator("./libri.manifest.test"),
+            batch_size=args.batch_size // args.trainer),
+        padding=[-1, 1000])
+
+    # create event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_batch_reader, feeding=feeding)
+            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, result.cost,
+                                                  result.metrics)
+            with gzip.open("params.tar.gz", 'w') as f:
+                parameters.to_tar(f)
+
+    # run train
+    trainer.train(
+        reader=train_batch_reader,
+        event_handler=event_handler,
+        num_passes=args.num_passes,
+        feeding=feeding)
+
+
+def main():
+    train()
+
+
+if __name__ == '__main__':
+    main()
--
GitLab
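
For readers who want to sanity-check the data pipeline introduced in `audio_data_utils.py` before launching a full training run, a minimal sketch follows. It assumes `python librispeech.py` has already been run (so `./libri.manifest.dev` exists), that the snippet is executed from the `deep_speech_2` directory, and that the batch size of 4 is an arbitrary choice for illustration.

```
import paddle.v2 as paddle

import audio_data_utils

# Build a padded batch reader over the dev manifest, mirroring train.py
# (time axis padded to at least 1000 frames, spectrograms flattened).
batch_reader = audio_data_utils.padding_batch_reader(
    paddle.batch(
        audio_data_utils.reader_creator("./libri.manifest.dev"),
        batch_size=4),
    padding=[-1, 1000])

# Inspect the first batch: each item is (flattened spectrogram, token id list).
for batch in batch_reader():
    for audio, text in batch:
        print("feature size: %d, transcript length: %d" %
              (len(audio), len(text)))
    break
```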