提交 8122dd9c 编写于 作者: X Xinghai Sun

Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class.

上级 92eacf54
...@@ -4,14 +4,11 @@ from __future__ import division ...@@ -4,14 +4,11 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import distutils.util import distutils.util
import sys
import argparse import argparse
import gzip import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils import utils
...@@ -119,37 +116,12 @@ args = parser.parse_args() ...@@ -119,37 +116,12 @@ args = parser.parse_args()
def evaluate(): def evaluate():
"""Evaluate on whole test data for DeepSpeech2.""" """Evaluate on whole test data for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.decode_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
...@@ -157,59 +129,34 @@ def evaluate(): ...@@ -157,59 +129,34 @@ def evaluate():
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# define inferer ds2_model = DeepSpeech2Model(
inferer = paddle.inference.Inference( vocab_size=data_generator.vocab_size,
output_layer=output_probs, parameters=parameters) num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
# initialize external scorer for beam search decoding rnn_layer_size=args.rnn_layer_size,
if args.decode_method == 'beam_search': pretrained_model_path=args.model_filepath)
ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
wer_counter, wer_sum = 0, 0.0 wer_sum, num_ins = 0.0, 0
for infer_data in batch_reader(): for infer_data in batch_reader():
# run inference result_transcripts = ds2_model.infer_batch(
infer_results = inferer.infer(input=infer_data) infer_data=infer_data,
num_steps = len(infer_results) // len(infer_data) decode_method=args.decode_method,
probs_split = [ beam_alpha=args.alpha,
infer_results[i * num_steps:(i + 1) * num_steps] beam_beta=args.beta,
for i in xrange(0, len(infer_data)) beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in infer_data
] ]
# target transcription for target, result in zip(target_transcripts, result_transcripts):
target_transcription = [ wer_sum += wer(target, result)
''.join([ num_ins += 1
data_generator.vocab_list[index] for index in infer_data[i][1] print("WER (%d/?) = %f" % (num_ins, wer_sum / num_ins))
]) for i, probs in enumerate(probs_split) print("Final WER (%d/%d) = %f" % (num_ins, num_ins, wer_sum / num_ins))
]
# decode and print
# best path decode
if args.decode_method == "best_path":
for i, probs in enumerate(probs_split):
output_transcription = ctc_best_path_decoder(
probs_seq=probs, vocabulary=data_generator.vocab_list)
wer_sum += wer(target_transcription[i], output_transcription)
wer_counter += 1
# beam search decode
elif args.decode_method == "beam_search":
# beam search using multiple processes
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size,
blank_id=len(data_generator.vocab_list),
num_processes=args.num_processes_beam_search,
ext_scoring_func=ext_scorer,
cutoff_prob=args.cutoff_prob)
for i, beam_search_result in enumerate(beam_search_results):
wer_sum += wer(target_transcription[i],
beam_search_result[0][1])
wer_counter += 1
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
print("WER (%d/?) = %f" % (wer_counter, wer_sum / wer_counter))
print("Final WER (%d/%d) = %f" % (wer_counter, wer_counter,
wer_sum / wer_counter))
def main(): def main():
......
...@@ -4,14 +4,11 @@ from __future__ import division ...@@ -4,14 +4,11 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import gzip
import distutils.util import distutils.util
import multiprocessing import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils import utils
...@@ -124,37 +121,12 @@ args = parser.parse_args() ...@@ -124,37 +121,12 @@ args = parser.parse_args()
def infer(): def infer():
"""Inference for DeepSpeech2.""" """Inference for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.decode_manifest_path,
batch_size=args.num_samples, batch_size=args.num_samples,
...@@ -163,61 +135,31 @@ def infer(): ...@@ -163,61 +135,31 @@ def infer():
shuffle_method=None) shuffle_method=None)
infer_data = batch_reader().next() infer_data = batch_reader().next()
# run inference ds2_model = DeepSpeech2Model(
infer_results = paddle.infer( vocab_size=data_generator.vocab_size,
output_layer=output_probs, parameters=parameters, input=infer_data) num_conv_layers=args.num_conv_layers,
num_steps = len(infer_results) // len(infer_data) num_rnn_layers=args.num_rnn_layers,
probs_split = [ rnn_layer_size=args.rnn_layer_size,
infer_results[i * num_steps:(i + 1) * num_steps] pretrained_model_path=args.model_filepath)
for i in xrange(len(infer_data)) result_transcripts = ds2_model.infer_batch(
] infer_data=infer_data,
decode_method=args.decode_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
# targe transcription target_transcripts = [
target_transcription = [ ''.join([data_generator.vocab_list[token] for token in transcript])
''.join( for _, transcript in infer_data
[data_generator.vocab_list[index] for index in infer_data[i][1]])
for i, probs in enumerate(probs_split)
] ]
for target, result in zip(target_transcripts, result_transcripts):
## decode and print print("\nTarget Transcription: %s\nOutput Transcription: %s" %
# best path decode (target, result))
wer_sum, wer_counter = 0, 0 print("Current wer = %f" % wer(target, result))
if args.decode_method == "best_path":
for i, probs in enumerate(probs_split):
best_path_transcription = ctc_best_path_decoder(
probs_seq=probs, vocabulary=data_generator.vocab_list)
print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target_transcription[i], best_path_transcription))
wer_cur = wer(target_transcription[i], best_path_transcription)
wer_sum += wer_cur
wer_counter += 1
print("cur wer = %f, average wer = %f" %
(wer_cur, wer_sum / wer_counter))
# beam search decode
elif args.decode_method == "beam_search":
ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
beam_search_batch_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size,
blank_id=len(data_generator.vocab_list),
num_processes=args.num_processes_beam_search,
cutoff_prob=args.cutoff_prob,
ext_scoring_func=ext_scorer, )
for i, beam_search_result in enumerate(beam_search_batch_results):
print("\nTarget Transcription:\t%s" % target_transcription[i])
for index in xrange(args.num_results_per_sample):
result = beam_search_result[index]
#output: index, log prob, beam result
print("Beam %d: %f \t%s" % (index, result[0], result[1]))
wer_cur = wer(target_transcription[i], beam_search_result[0][1])
wer_sum += wer_cur
wer_counter += 1
print("Current WER = %f , Average WER = %f" %
(wer_cur, wer_sum / wer_counter))
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
def main(): def main():
......
"""Contains DeepSpeech2 layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle
# Temporary workaround: when True, every batch_norm layer below is forced to
# use the plain (non-cuDNN) "batch_norm" implementation. Marked in the code as
# a patch to be removed once the cuDNN batch norm path works.
DISABLE_CUDNN_BATCH_NORM = True
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer followed by batch normalization.

    The convolution itself is linear (no bias); the activation is applied
    by the batch normalization layer on top of it.

    :param input: Input layer.
    :param filter_size: Convolution filter size.
    :param num_channels_in: Number of input channels.
    :param num_channels_out: Number of output channels (filters).
    :param stride: Convolution stride.
    :param padding: Convolution padding.
    :param act: Activation applied after batch normalization.
    :return: Batch-normalized convolution output layer.
    """
    linear_conv = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    bn_kwargs = dict(input=linear_conv, act=act)
    if DISABLE_CUDNN_BATCH_NORM:
        # Temporary patch, to be removed: force the non-cuDNN batch norm.
        bn_kwargs["batch_norm_type"] = "batch_norm"
    return paddle.layer.batch_norm(**bn_kwargs)
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Bidirectional simple RNN layer with sequence-wise batch normalization.

    The batch normalization is performed only on the input-state projection,
    which is shared across the forward and backward directions.

    :param name: Layer name (currently unused by this implementation).
    :param input: Input layer.
    :param size: Number of RNN cells per direction.
    :param act: Activation type for the recurrent layers.
    :return: Concatenation of the forward and backward RNN outputs.
    """
    # Input-hidden projection, shared across both directions.
    shared_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # Batch norm is applied to the input-state projection only.
    bn_kwargs = dict(input=shared_proj, act=paddle.activation.Linear())
    if DISABLE_CUDNN_BATCH_NORM:
        # Temporary patch, to be removed: force the non-cuDNN batch norm.
        bn_kwargs["batch_norm_type"] = "batch_norm"
    proj_bn = paddle.layer.batch_norm(**bn_kwargs)
    # Run the recurrence forward and backward in time, then concatenate.
    directional_outputs = [
        paddle.layer.recurrent(input=proj_bn, act=act, reverse=is_reverse)
        for is_reverse in (False, True)
    ]
    return paddle.layer.concat(input=directional_outputs)
def conv_group(input, num_stacks):
    """Convolution group with several stacked convolution layers.

    :param input: Input layer (single-channel audio spectrogram).
    :param num_stacks: Number of stacked convolution layers.
    :return: Tuple of (output layer, output channel count, output height).
    """
    # First layer: 1 -> 32 channels, downsampling with stride (3, 2).
    output = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    # Subsequent layers: 32 -> 32 channels with stride (1, 2).
    for _ in xrange(num_stacks - 1):
        output = conv_bn_layer(
            input=output,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    final_num_channels = 32
    # Height shrinks by half per stack (from the stride-2 dimension);
    # 160 presumably reflects the input spectrogram height — TODO confirm.
    final_height = 160 // 2 ** num_stacks + 1
    return output, final_num_channels, final_height
def rnn_group(input, size, num_stacks):
    """RNN group with several stacked bidirectional RNN layers.

    :param input: Input layer.
    :param size: Number of RNN cells per direction in each layer.
    :param num_stacks: Number of stacked RNN layers.
    :return: Output of the last stacked layer.
    """
    current = input
    for layer_idx in xrange(num_stacks):
        current = bidirectional_simple_rnn_bn_layer(
            name=str(layer_idx),
            input=current,
            size=size,
            act=paddle.activation.BRelu())
    return current
def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of an output probability layer (softmax over the
             token dictionary plus the CTC blank) and a CTC cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    # +1 output for the CTC blank symbol
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost; the last token id (dict_size) is reserved for the blank
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
...@@ -3,141 +3,150 @@ from __future__ import absolute_import ...@@ -3,141 +3,150 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys
import os
import time
import gzip
from decoder import *
from lm.lm_scorer import LmScorer
import paddle.v2 as paddle import paddle.v2 as paddle
from layer import *
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, class DeepSpeech2Model(object):
padding, act): def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
""" rnn_layer_size, pretrained_model_path):
Convolution layer with batch normalization. self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
""" rnn_layer_size)
conv_layer = paddle.layer.img_conv( self._create_parameters(pretrained_model_path)
input=input, self._inferer = None
filter_size=filter_size, self._ext_scorer = None
num_channels=num_channels_in,
num_filters=num_channels_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
def train(self,
train_batch_reader,
dev_batch_reader,
feeding_dict,
learning_rate,
gradient_clipping,
num_passes,
num_iterations_print=100,
output_model_dir='checkpoints'):
# prepare optimizer and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=learning_rate,
gradient_clipping_threshold=gradient_clipping)
trainer = paddle.trainer.SGD(
cost=self._loss,
parameters=self._parameters,
update_equation=optimizer)
def bidirectional_simple_rnn_bn_layer(name, input, size, act): # create event handler
""" def event_handler(event):
Bidirectonal simple rnn layer with sequence-wise batch normalization. global start_time, cost_sum, cost_counter
The batch normalization is only performed on input-state weights. if isinstance(event, paddle.event.EndIteration):
""" cost_sum += event.cost
# input-hidden weights shared across bi-direcitonal rnn. cost_counter += 1
input_proj = paddle.layer.fc( if (event.batch_id + 1) % num_iterations_print == 0:
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) output_model_path = os.path.join(output_model_dir,
# batch norm is only performed on input-state projection "params.latest.tar.gz")
input_proj_bn = paddle.layer.batch_norm( with gzip.open(output_model_path, 'w') as f:
input=input_proj, act=paddle.activation.Linear()) self._parameters.to_tar(f)
# forward and backward in time print("\nPass: %d, Batch: %d, TrainCost: %f" %
forward_simple_rnn = paddle.layer.recurrent( (event.pass_id, event.batch_id + 1,
input=input_proj_bn, act=act, reverse=False) cost_sum / cost_counter))
backward_simple_rnn = paddle.layer.recurrent( cost_sum, cost_counter = 0.0, 0
input=input_proj_bn, act=act, reverse=True) else:
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=dev_batch_reader, feeding=feeding_dict)
output_model_path = os.path.join(
output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
with gzip.open(output_model_path, 'w') as f:
self._parameters.to_tar(f)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding_dict)
def conv_group(input, num_stacks): def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
""" beam_size, cutoff_prob, vocab_list, language_model_path,
Convolution group with several stacking convolution layers. num_processes):
""" # define inferer
conv = conv_bn_layer( if self._inferer == None:
input=input, self._inferer = paddle.inference.Inference(
filter_size=(11, 41), output_layer=self._log_probs, parameters=self._parameters)
num_channels_in=1, # run inference
num_channels_out=32, infer_results = self._inferer.infer(input=infer_data)
stride=(3, 2), num_steps = len(infer_results) // len(infer_data)
padding=(5, 20), probs_split = [
act=paddle.activation.BRelu()) infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(num_stacks - 1): for i in xrange(0, len(infer_data))
conv = conv_bn_layer( ]
input=conv, # run decoder
filter_size=(11, 21), results = []
num_channels_in=32, if decode_method == "best_path":
num_channels_out=32, # best path decode
stride=(1, 2), for i, probs in enumerate(probs_split):
padding=(5, 10), output_transcription = ctc_best_path_decoder(
act=paddle.activation.BRelu()) probs_seq=probs, vocabulary=data_generator.vocab_list)
output_num_channels = 32 results.append(output_transcription)
output_height = 160 // pow(2, num_stacks) + 1 elif decode_method == "beam_search":
return conv, output_num_channels, output_height # initialize external scorer
if self._ext_scorer == None:
self._ext_scorer = LmScorer(beam_alpha, beam_beta,
language_model_path)
self._loaded_lm_path = language_model_path
else:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
assert self._loaded_lm_path == language_model_path
# beam search decode
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
blank_id=len(vocab_list),
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob)
results = [result[0][1] for result in beam_search_results]
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
return results
def rnn_group(input, size, num_stacks): def _create_parameters(self, model_path=None):
""" if model_path is None:
RNN group with several stacking RNN layers. self._parameters = paddle.parameters.create(self._loss)
""" else:
output = input self._parameters = paddle.parameters.Parameters.from_tar(
for i in xrange(num_stacks): gzip.open(model_path))
output = bidirectional_simple_rnn_bn_layer(
name=str(i), input=output, size=size, act=paddle.activation.BRelu())
return output
def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
def deep_speech2(audio_data, rnn_layer_size):
text_data, # paddle.data_type.dense_array is used for variable batch input.
dict_size, # The size 161 * 161 is only an placeholder value and the real shape
num_conv_layers=2, # of input batch data will be induced during training.
num_rnn_layers=3, audio_data = paddle.layer.data(
rnn_size=256, name="audio_spectrogram",
is_inference=False): type=paddle.data_type.dense_array(161 * 161))
""" text_data = paddle.layer.data(
The whole DeepSpeech2 model structure (a simplified version). name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
:param audio_data: Audio spectrogram data layer. self._log_probs, self._loss = deep_speech2(
:type audio_data: LayerOutput audio_data=audio_data,
:param text_data: Transcription text data layer. text_data=text_data,
:type text_data: LayerOutput dict_size=vocab_size,
:param dict_size: Dictionary size for tokenized transcription. num_conv_layers=num_conv_layers,
:type dict_size: int num_rnn_layers=num_rnn_layers,
:param num_conv_layers: Number of stacking convolution layers. rnn_size=rnn_layer_size)
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:param is_inference: False in the training mode, and True in the
inferene mode.
:type is_inference: bool
:return: If is_inference set False, return a ctc cost layer;
if is_inference set True, return a sequence layer of output
probability distribution.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
# convert data form convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
num_channels=conv_group_num_channels,
stride_x=1,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
act=paddle.activation.Linear(),
bias_attr=True)
if is_inference:
# probability distribution with softmax
return paddle.layer.mixed(
input=paddle.layer.identity_projection(input=fc),
act=paddle.activation.Softmax())
else:
# ctc cost
return paddle.layer.warp_ctc(
input=fc,
label=text_data,
size=dict_size + 1,
blank=dict_size,
norm_by_times=True)
...@@ -3,15 +3,11 @@ from __future__ import absolute_import ...@@ -3,15 +3,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys
import os
import argparse import argparse
import gzip
import time
import distutils.util import distutils.util
import multiprocessing import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from model import deep_speech2 from model import DeepSpeech2Model
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
import utils import utils
...@@ -23,6 +19,12 @@ parser.add_argument( ...@@ -23,6 +19,12 @@ parser.add_argument(
default=200, default=200,
type=int, type=int,
help="Training pass number. (default: %(default)s)") help="Training pass number. (default: %(default)s)")
parser.add_argument(
"--num_iterations_print",
default=100,
type=int,
help="Number of iterations for every train cost printing. "
"(default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_conv_layers", "--num_conv_layers",
default=2, default=2,
...@@ -127,100 +129,47 @@ args = parser.parse_args() ...@@ -127,100 +129,47 @@ args = parser.parse_args()
def train(): def train():
"""DeepSpeech2 training.""" """DeepSpeech2 training."""
train_generator = DataGenerator(
# initialize data generator vocab_filepath=args.vocab_filepath,
def data_generator(): mean_std_filepath=args.mean_std_filepath,
return DataGenerator( augmentation_config=args.augmentation_config,
vocab_filepath=args.vocab_filepath, max_duration=args.max_duration,
mean_std_filepath=args.mean_std_filepath, min_duration=args.min_duration,
augmentation_config=args.augmentation_config, specgram_type=args.specgram_type,
max_duration=args.max_duration, num_threads=args.num_threads_data)
min_duration=args.min_duration, dev_generator = DataGenerator(
specgram_type=args.specgram_type, vocab_filepath=args.vocab_filepath,
num_threads=args.num_threads_data) mean_std_filepath=args.mean_std_filepath,
augmentation_config="{}",
train_generator = data_generator() specgram_type=args.specgram_type,
test_generator = data_generator() num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(
train_generator.vocab_size))
cost = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=train_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=False)
# create/load parameters and optimizer
if args.init_model_path is None:
parameters = paddle.parameters.create(cost)
else:
if not os.path.isfile(args.init_model_path):
raise IOError("Invalid model!")
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# prepare data reader
train_batch_reader = train_generator.batch_reader_creator( train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path, manifest_path=args.train_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=args.trainer_count, min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False, sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method) shuffle_method=args.shuffle_method)
test_batch_reader = test_generator.batch_reader_creator( dev_batch_reader = dev_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path, manifest_path=args.dev_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=1, # must be 1, but will have errors. min_batch_size=1, # must be 1, but will have errors.
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# create event handler ds2_model = DeepSpeech2Model(
def event_handler(event): vocab_size=train_generator.vocab_size,
global start_time, cost_sum, cost_counter num_conv_layers=args.num_conv_layers,
if isinstance(event, paddle.event.EndIteration): num_rnn_layers=args.num_rnn_layers,
cost_sum += event.cost rnn_layer_size=args.rnn_layer_size,
cost_counter += 1 pretrained_model_path=args.init_model_path)
if (event.batch_id + 1) % 100 == 0: ds2_model.train(
print("\nPass: %d, Batch: %d, TrainCost: %f" % ( train_batch_reader=train_batch_reader,
event.pass_id, event.batch_id + 1, cost_sum / cost_counter)) dev_batch_reader=dev_batch_reader,
cost_sum, cost_counter = 0.0, 0 feeding_dict=train_generator.feeding,
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f: learning_rate=args.adam_learning_rate,
parameters.to_tar(f) gradient_clipping=400,
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=args.num_passes, num_passes=args.num_passes,
feeding=train_generator.feeding) num_iterations_print=args.num_iterations_print)
def main(): def main():
......
...@@ -3,14 +3,13 @@ from __future__ import absolute_import ...@@ -3,14 +3,13 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np
import distutils.util import distutils.util
import argparse import argparse
import gzip import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils import utils
...@@ -40,6 +39,11 @@ parser.add_argument( ...@@ -40,6 +39,11 @@ parser.add_argument(
default=True, default=True,
type=distutils.util.strtobool, type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)") help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_threads_data", "--num_threads_data",
default=multiprocessing.cpu_count(), default=multiprocessing.cpu_count(),
...@@ -62,10 +66,10 @@ parser.add_argument( ...@@ -62,10 +66,10 @@ parser.add_argument(
type=str, type=str,
help="Manifest path for normalizer. (default: %(default)s)") help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--decode_manifest_path", "--tune_manifest_path",
default='datasets/manifest.test', default='datasets/manifest.test',
type=str, type=str,
help="Manifest path for decoding. (default: %(default)s)") help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--model_filepath", "--model_filepath",
default='checkpoints/params.latest.tar.gz', default='checkpoints/params.latest.tar.gz',
...@@ -127,96 +131,64 @@ args = parser.parse_args() ...@@ -127,96 +131,64 @@ args = parser.parse_args()
def tune(): def tune():
"""Tune parameters alpha and beta on one minibatch.""" """Tune parameters alpha and beta on one minibatch."""
if not args.num_alphas >= 0: if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!") raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0: if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!") raise ValueError("num_betas must be non-negative!")
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.tune_manifest_path,
batch_size=args.num_samples, batch_size=args.num_samples,
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# get one batch data for tuning tune_data = batch_reader().next()
infer_data = batch_reader().next() target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
# run inference for _, transcript in tune_data
infer_results = paddle.infer(
output_layer=output_probs, parameters=parameters, input=infer_data)
num_steps = len(infer_results) // len(infer_data)
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(0, len(infer_data))
] ]
ds2_model = DeepSpeech2Model(
vocab_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
pretrained_model_path=args.model_filepath)
# create grid for search # create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas] for beta in cand_betas]
ext_scorer = LmScorer(args.alpha_from, args.beta_from,
args.language_model_path)
## tune parameters in loop ## tune parameters in loop
for alpha, beta in params_grid: for alpha, beta in params_grid:
wer_sum, wer_counter = 0, 0 result_transcripts = ds2_model.infer_batch(
# reset scorer infer_data=tune_data,
ext_scorer.reset_params(alpha, beta) decode_method='beam_search',
# beam search using multiple processes beam_alpha=alpha,
beam_search_results = ctc_beam_search_decoder_batch( beam_beta=beta,
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
blank_id=len(data_generator.vocab_list), vocab_list=data_generator.vocab_list,
num_processes=args.num_processes_beam_search, language_model_path=args.language_model_path,
ext_scoring_func=ext_scorer, ) num_processes=args.num_processes_beam_search)
for i, beam_search_result in enumerate(beam_search_results): wer_sum, num_ins = 0.0, 0
target_transcription = ''.join([ for target, result in zip(target_transcripts, result_transcripts):
data_generator.vocab_list[index] for index in infer_data[i][1] wer_sum += wer(target, result)
]) num_ins += 1
wer_sum += wer(target_transcription, beam_search_result[0][1])
wer_counter += 1
print("alpha = %f\tbeta = %f\tWER = %f" % print("alpha = %f\tbeta = %f\tWER = %f" %
(alpha, beta, wer_sum / wer_counter)) (alpha, beta, wer_sum / num_ins))
def main(): def main():
paddle.init(use_gpu=args.use_gpu, trainer_count=1) utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune() tune()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册