提交 7739b52e 编写于 作者: X Xinghai Sun

Add function docs.

上级 47b706cc
"""
Audio data preprocessing tools and reader creators.
"""
import paddle.v2 as paddle import paddle.v2 as paddle
import logging import logging
import json import json
...@@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path): ...@@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path):
def get_vocabulary_size(): def get_vocabulary_size():
"""
Get vocabulary size.
"""
vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
return len(vocab_dict) return len(vocab_dict)
def get_vocabulary(): def get_vocabulary():
"""
Get vocabulary.
"""
return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH) return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
def parse_transcript(text, vocabulary): def parse_transcript(text, vocabulary):
""" """
Convert the transcript text string to list of token index integers.. Convert the transcript text string to list of token index integers.
""" """
return [vocabulary[w] for w in text] return [vocabulary[w] for w in text]
...@@ -106,6 +115,28 @@ def reader_creator(manifest_path, ...@@ -106,6 +115,28 @@ def reader_creator(manifest_path,
shuffle=False, shuffle=False,
max_duration=10.0, max_duration=10.0,
min_duration=0.0): min_duration=0.0):
"""
Audio data reader creator.
Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
tokenized transcription text.
:param manifest_path: Filepath for Manifest of audio clip files.
:type manifest_path: basestring
:param sort_by_duration: Sort the audio clips by duration if set True.
For SortaGrad.
:type sort_by_duration: bool
:param shuffle: Shuffle the audio clips if set True.
:type shuffle: bool
:param max_duration: Audio clips with duration (in seconds) greater than
this will be discarded.
:type max_duration: float
:param min_duration: Audio clips with duration (in seconds) smaller than
this will be discarded.
:type min_duration: float
:return: Data reader function.
:rtype: callable
"""
if sort_by_duration and shuffle: if sort_by_duration and shuffle:
sort_by_duration = False sort_by_duration = False
logger.warn("When shuffle set to true, " logger.warn("When shuffle set to true, "
...@@ -138,6 +169,27 @@ def reader_creator(manifest_path, ...@@ -138,6 +169,27 @@ def reader_creator(manifest_path,
def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True): def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
"""
Padding for batches. Return a batch reader.
Each instance in a batch will be padded to the same target shape.
The target shape is the largest shape among all the batch instances and the
'padding' argument. Therefore, if padding is set to [-1, -1], instances will
be padded to the same shape only within each batch, and the shape can
differ across batches; if padding is set to
[VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
:param batch_reader: Input batch reader.
:type batch_reader: callable
:param padding: Padding pattern. Details please refer to the above.
:type padding: list
:param flatten: Flatten the tensor to be one dimension.
:type flatten: bool
:return: Batch reader function.
:rtype: callable
"""
def padding_batch(batch): def padding_batch(batch):
new_batch = [] new_batch = []
# get target shape within batch # get target shape within batch
......
"""
Inference for a simplified version of Baidu DeepSpeech2 model.
"""
import paddle.v2 as paddle import paddle.v2 as paddle
import audio_data_utils from itertools import groupby
import argparse import argparse
from model import deep_speech2
import gzip import gzip
from itertools import groupby import audio_data_utils
from model import deep_speech2
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Simpled version of DeepSpeech2 inference.') description='Simplified version of DeepSpeech2 inference.')
parser.add_argument( parser.add_argument(
"--num_samples", default=10, type=int, help="Number of inference samples.") "--num_samples",
default=10,
type=int,
help="Number of samples for inference.")
parser.add_argument( parser.add_argument(
"--num_conv_layers", default=2, type=int, help="Convolution layer number.") "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
parser.add_argument( parser.add_argument(
...@@ -21,13 +28,21 @@ args = parser.parse_args() ...@@ -21,13 +28,21 @@ args = parser.parse_args()
def remove_duplicate_and_blank(id_list, blank_id): def remove_duplicate_and_blank(id_list, blank_id):
"""
Postprocessing for max-ctc-decoder.
- remove consecutive duplicate tokens.
- remove blanks.
"""
# remove consecutive duplicate tokens # remove consecutive duplicate tokens
id_list = [x[0] for x in groupby(id_list)] id_list = [x[0] for x in groupby(id_list)]
# remove blank # remove blanks
return [id for id in id_list if id != blank_id] return [id for id in id_list if id != blank_id]
def max_infer(): def max_infer():
"""
Max-ctc-decoding for DeepSpeech2.
"""
# create network config # create network config
_, vocab_list = audio_data_utils.get_vocabulary() _, vocab_list = audio_data_utils.get_vocabulary()
dict_size = len(vocab_list) dict_size = len(vocab_list)
...@@ -64,7 +79,7 @@ def max_infer(): ...@@ -64,7 +79,7 @@ def max_infer():
padding=[-1, 1000]) padding=[-1, 1000])
infer_data = test_batch_reader().next() infer_data = test_batch_reader().next()
# run inference # run max-ctc-decoding
max_id_results = paddle.infer( max_id_results = paddle.infer(
output_layer=max_id, output_layer=max_id,
parameters=parameters, parameters=parameters,
......
"""
Download, unpack and create manifest for LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It serves as a unified
interface to organize different data sets.
"""
import paddle.v2 as paddle import paddle.v2 as paddle
import os import os
import wget import wget
...@@ -88,9 +96,10 @@ def main(): ...@@ -88,9 +96,10 @@ def main():
url=URL_DEV, url=URL_DEV,
target_dir=os.path.join(args.target_dir), target_dir=os.path.join(args.target_dir),
manifest_path=args.manifest + ".dev") manifest_path=args.manifest + ".dev")
#prepare_dataset(url=URL_TRAIN, prepare_dataset(
#target_dir=os.path.join(args.target_dir), url=URL_TRAIN,
#manifest_path=args.manifest + ".train") target_dir=os.path.join(args.target_dir),
manifest_path=args.manifest + ".train")
if __name__ == '__main__': if __name__ == '__main__':
......
"""
A simplified version of Baidu DeepSpeech2 model.
"""
import paddle.v2 as paddle import paddle.v2 as paddle
#TODO: add bidirectional rnn.
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act): padding, act):
"""
Convolution layer with batch normalization.
"""
conv_layer = paddle.layer.img_conv( conv_layer = paddle.layer.img_conv(
input=input, input=input,
filter_size=filter_size, filter_size=filter_size,
...@@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, ...@@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
def bidirectonal_simple_rnn_bn_layer(name, input, size, act): def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
"""
Bidirectional simple rnn layer with batch normalization.
The batch normalization is only performed on input-state projection
(sequence-wise normalization).
Question: are the mean and variance statistics computed over the whole
sequence or just on each individual time step?
"""
def __simple_rnn_step__(input): def __simple_rnn_step__(input):
last_state = paddle.layer.memory(name=name + "_state", size=size) last_state = paddle.layer.memory(name=name + "_state", size=size)
input_fc = paddle.layer.fc( input_fc = paddle.layer.fc(
...@@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act): ...@@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
size=size, size=size,
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
bias_attr=False) bias_attr=False)
# batch norm is only performed on input-state projection
input_fc_bn = paddle.layer.batch_norm( input_fc_bn = paddle.layer.batch_norm(
input=input_fc, act=paddle.activation.Linear()) input=input_fc, act=paddle.activation.Linear())
state_fc = paddle.layer.fc( state_fc = paddle.layer.fc(
...@@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act): ...@@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
def conv_group(input, num_stacks): def conv_group(input, num_stacks):
"""
Convolution group with several stacked convolution layers.
"""
conv = conv_bn_layer( conv = conv_bn_layer(
input=input, input=input,
filter_size=(11, 41), filter_size=(11, 41),
...@@ -68,6 +90,9 @@ def conv_group(input, num_stacks): ...@@ -68,6 +90,9 @@ def conv_group(input, num_stacks):
def rnn_group(input, size, num_stacks): def rnn_group(input, size, num_stacks):
"""
RNN group with several stacked RNN layers.
"""
output = input output = input
for i in xrange(num_stacks): for i in xrange(num_stacks):
output = bidirectonal_simple_rnn_bn_layer( output = bidirectonal_simple_rnn_bn_layer(
...@@ -81,7 +106,27 @@ def deep_speech2(audio_data, ...@@ -81,7 +106,27 @@ def deep_speech2(audio_data,
num_conv_layers=2, num_conv_layers=2,
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=256): rnn_size=256):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacked convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacked RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:return: Tuple of the cost layer and the max_id decoder layer.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers) conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
# convert data from convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand( conv2seq = paddle.layer.block_expand(
input=conv_group_output, input=conv_group_output,
num_channels=32, num_channels=32,
...@@ -89,18 +134,22 @@ def deep_speech2(audio_data, ...@@ -89,18 +134,22 @@ def deep_speech2(audio_data,
stride_y=1, stride_y=1,
block_x=1, block_x=1,
block_y=21) block_y=21)
# rnn group
rnn_group_output = rnn_group( rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
# output token distribution
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=rnn_group_output, input=rnn_group_output,
size=dict_size + 1, size=dict_size + 1,
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
bias_attr=True) bias_attr=True)
# ctc cost
cost = paddle.layer.warp_ctc( cost = paddle.layer.warp_ctc(
input=fc, input=fc,
label=text_data, label=text_data,
size=dict_size + 1, size=dict_size + 1,
blank=dict_size, blank=dict_size,
norm_by_times=True) norm_by_times=True)
# max decoder
max_id = paddle.layer.max_id(input=fc) max_id = paddle.layer.max_id(input=fc)
return cost, max_id return cost, max_id
"""
Trainer for a simplified version of Baidu DeepSpeech2 model.
"""
import paddle.v2 as paddle import paddle.v2 as paddle
import audio_data_utils
import argparse import argparse
from model import deep_speech2
import gzip import gzip
import sys
from model import deep_speech2
import audio_data_utils
#TODO: add WER metric
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Simpled version of DeepSpeech2 trainer.') description='Simplified version of DeepSpeech2 trainer.')
parser.add_argument( parser.add_argument(
"--batch_size", default=512, type=int, help="Minibatch size.") "--batch_size", default=512, type=int, help="Minibatch size.")
parser.add_argument("--trainer", default=1, type=int, help="Trainer number.") parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
parser.add_argument( parser.add_argument(
"--num_passes", default=20, type=int, help="Training pass number.") "--num_passes", default=20, type=int, help="Training pass number.")
parser.add_argument( parser.add_argument(
"--num_conv_layers", default=2, type=int, help="Convolution layer number.") "--num_conv_layers", default=3, type=int, help="Convolution layer number.")
parser.add_argument( parser.add_argument(
"--num_rnn_layers", default=3, type=int, help="RNN layer number.") "--num_rnn_layers", default=5, type=int, help="RNN layer number.")
parser.add_argument( parser.add_argument(
"--rnn_layer_size", default=256, type=int, help="RNN layer cell number.") "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
parser.add_argument( parser.add_argument(
...@@ -25,6 +32,9 @@ args = parser.parse_args() ...@@ -25,6 +32,9 @@ args = parser.parse_args()
def train(): def train():
"""
DeepSpeech2 training.
"""
# create network config # create network config
dict_size = audio_data_utils.get_vocabulary_size() dict_size = audio_data_utils.get_vocabulary_size()
audio_data = paddle.layer.data( audio_data = paddle.layer.data(
...@@ -89,8 +99,7 @@ def train(): ...@@ -89,8 +99,7 @@ def train():
sys.stdout.flush() sys.stdout.flush()
if isinstance(event, paddle.event.EndPass): if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_batch_reader, feeding=feeding) result = trainer.test(reader=test_batch_reader, feeding=feeding)
print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost, print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics)
result.metrics)
with gzip.open("params.tar.gz", 'w') as f: with gzip.open("params.tar.gz", 'w') as f:
parameters.to_tar(f) parameters.to_tar(f)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册