diff --git a/deep_speech_2/demo_server.py b/deep_speech_2/demo_server.py
index e4093ab29d992305dc02062d8fd73e6b6daeb39c..b000e35e91c20ec925fe1cd52a3901ed7ee9519f 100644
--- a/deep_speech_2/demo_server.py
+++ b/deep_speech_2/demo_server.py
@@ -63,9 +63,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forward and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -205,7 +212,8 @@ def start_server():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)
 
     # prepare ASR inference handler
     def file_to_transcript(filename):
diff --git a/deep_speech_2/evaluate.py b/deep_speech_2/evaluate.py
index 8ab5b94494c78e2b37c52a020e3aaaef30e3ae37..8dd169b6c2a41a1ad749324e6cba60bff98d951b 100644
--- a/deep_speech_2/evaluate.py
+++ b/deep_speech_2/evaluate.py
@@ -35,9 +35,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forward and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -148,7 +155,8 @@ def evaluate():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)
 
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     error_sum, num_ins = 0.0, 0
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 6b77f3d727eef4fad9bf69ae95816305603570bd..0c52ffc831b3349dacc5453bc21dc9a13e6471c8 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -30,9 +30,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forward and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
" + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -149,7 +156,8 @@ def infer(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath) + pretrained_model_path=args.model_filepath, + share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, decode_method=args.decode_method, diff --git a/deep_speech_2/layer.py b/deep_speech_2/layer.py index a91f694b8e92ff3e3b48e569ecc0a7751d26bee2..b7ac3c23e3c7bb91c9b2e616e7c42b7f87ca244f 100644 --- a/deep_speech_2/layer.py +++ b/deep_speech_2/layer.py @@ -39,7 +39,7 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, return paddle.layer.batch_norm(input=conv_layer, act=act) -def bidirectional_simple_rnn_bn_layer(name, input, size, act): +def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights): """Bidirectonal simple rnn layer with sequence-wise batch normalization. The batch normalization is only performed on input-state weights. @@ -51,24 +51,50 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): :type size: int :param act: Activation type. :type act: BaseActivation + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool :return: Bidirectional simple rnn layer. :rtype: LayerOutput """ - # input-hidden weights shared across bi-direcitonal rnn. - input_proj_forward = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - input_proj_backward = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn_forward = paddle.layer.batch_norm( - input=input_proj_forward, act=paddle.activation.Linear()) - input_proj_bn_backward = paddle.layer.batch_norm( - input=input_proj_backward, act=paddle.activation.Linear()) - # forward and backward in time - forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn_forward, act=act, reverse=False) - backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn_backward, act=act, reverse=True) + if share_weights: + # input-hidden weights shared between bi-direcitonal rnn. 
+        input_proj = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn = paddle.layer.batch_norm(
+            input=input_proj, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=True)
+
+    else:
+        input_proj_forward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        input_proj_backward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn_forward = paddle.layer.batch_norm(
+            input=input_proj_forward, act=paddle.activation.Linear())
+        input_proj_bn_backward = paddle.layer.batch_norm(
+            input=input_proj_backward, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_forward, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_backward, act=act, reverse=True)
+
     return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
@@ -87,7 +113,6 @@
     :return: Bidirectional simple rnn layer.
     :rtype: LayerOutput
     """
-    # input-hidden weights shared across bi-direcitonal rnn.
     input_proj_forward = paddle.layer.fc(
         input=input,
         size=size * 3,
@@ -98,7 +123,7 @@
         size=size * 3,
         act=paddle.activation.Linear(),
         bias_attr=False)
-    # batch norm is only performed on input-state projection
+    # batch norm is only performed on input-related projections
     input_proj_bn_forward = paddle.layer.batch_norm(
         input=input_proj_forward, act=paddle.activation.Linear())
     input_proj_bn_backward = paddle.layer.batch_norm(
@@ -126,9 +151,9 @@
         filter_size=(11, 41),
         num_channels_in=1,
         num_channels_out=32,
-        stride=(2, 2),
+        stride=(3, 2),
         padding=(5, 20),
-        act=paddle.activation.Relu())
+        act=paddle.activation.BRelu())
     for i in xrange(num_stacks - 1):
         conv = conv_bn_layer(
             input=conv,
@@ -137,13 +162,13 @@
             num_channels_out=32,
             stride=(1, 2),
             padding=(5, 10),
-            act=paddle.activation.Relu())
+            act=paddle.activation.BRelu())
     output_num_channels = 32
     output_height = 160 // pow(2, num_stacks) + 1
     return conv, output_num_channels, output_height
 
 
-def rnn_group(input, size, num_stacks, use_gru):
+def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
     """RNN group with stacked bidirectional simple RNN layers.
 
     :param input: Input layer.
@@ -154,6 +179,10 @@
     :type num_stacks: int
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_rnn_weights: bool
     :return: Output layer of the RNN group.
     :rtype: LayerOutput
     """
@@ -165,12 +194,14 @@
             input=output,
             size=size,
             act=paddle.activation.Relu())
+            # BRelu does not support hppl, need to add later. Use Relu instead.
         else:
             output = bidirectional_simple_rnn_bn_layer(
                 name=str(i),
                 input=output,
                 size=size,
-                act=paddle.activation.Relu())
+                act=paddle.activation.BRelu(),
+                share_weights=share_rnn_weights)
     return output
 
 
@@ -180,9 +211,10 @@ def deep_speech2(audio_data,
                  num_conv_layers=2,
                  num_rnn_layers=3,
                  rnn_size=256,
-                 use_gru=True):
+                 use_gru=False,
+                 share_rnn_weights=True):
     """
-    The whole DeepSpeech2 model structure (a simplified version).
+    The whole DeepSpeech2 model structure.
 
     :param audio_data: Audio spectrogram data layer.
     :type audio_data: LayerOutput
@@ -198,6 +230,10 @@
     :type rnn_size: int
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_rnn_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
              before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
@@ -218,7 +254,8 @@
         input=conv2seq,
         size=rnn_size,
         num_stacks=num_rnn_layers,
-        use_gru=use_gru)
+        use_gru=use_gru,
+        share_rnn_weights=share_rnn_weights)
     fc = paddle.layer.fc(
         input=rnn_group_output,
         size=dict_size + 1,
diff --git a/deep_speech_2/model.py b/deep_speech_2/model.py
index eec971c005f2ec7376bd23ed08002da7355af1c5..0234ed2d4c901f36ebfc16b317f5355cd57796e0 100644
--- a/deep_speech_2/model.py
+++ b/deep_speech_2/model.py
@@ -27,12 +27,17 @@ class DeepSpeech2Model(object):
     :param pretrained_model_path: Pretrained model path. If None, will train
                                   from stratch.
     :type pretrained_model_path: basestring|None
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs. Notice that
+                              for GRU, weight sharing is not supported.
+    :type share_rnn_weights: bool
     """
 
     def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
-                 rnn_layer_size, use_gru, pretrained_model_path):
+                 rnn_layer_size, use_gru, pretrained_model_path,
+                 share_rnn_weights):
         self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
-                             rnn_layer_size, use_gru)
+                             rnn_layer_size, use_gru, share_rnn_weights)
         self._create_parameters(pretrained_model_path)
         self._inferer = None
         self._loss_inferer = None
@@ -226,7 +231,7 @@
             gzip.open(model_path))
 
     def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
-                        rnn_layer_size, use_gru):
+                        rnn_layer_size, use_gru, share_rnn_weights):
         """Create data layers and model network."""
         # paddle.data_type.dense_array is used for variable batch input.
         # The size 161 * 161 is only an placeholder value and the real shape
@@ -244,4 +249,5 @@
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_layer_size,
-            use_gru=use_gru)
+            use_gru=use_gru,
+            share_rnn_weights=share_rnn_weights)
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index 42870bf536e95f6622451b8906381f05db3ea4b6..d055341f10c82f3cec38867e2db36cfaaabe0a79 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -37,9 +37,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=1024,
+    default=2048,
    type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forward and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -176,7 +183,8 @@ def train():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.init_model_path)
+        pretrained_model_path=args.init_model_path,
+        share_rnn_weights=args.share_rnn_weights)
     ds2_model.train(
         train_batch_reader=train_batch_reader,
         dev_batch_reader=dev_batch_reader,
diff --git a/deep_speech_2/tune.py b/deep_speech_2/tune.py
index ffab8860bb2b36fd47f11fea0cf16e61e0af97e1..d8001339eef1f51bb221238a647b2c4857a790d2 100644
--- a/deep_speech_2/tune.py
+++ b/deep_speech_2/tune.py
@@ -31,9 +31,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forward and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -164,7 +171,8 @@ def tune():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)
 
     # create grid for search
     cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
diff --git a/deep_speech_2/utils.py b/deep_speech_2/utils.py
index 9ca363c8f59c2b1cd2885db4b04605c0025998bf..1d51e2042397b4d3010259a8a3174bc969968aec 100644
--- a/deep_speech_2/utils.py
+++ b/deep_speech_2/utils.py
@@ -10,12 +10,12 @@ def print_arguments(args):
     Usage:
 
     .. code-block:: python
-    
+
         parser = argparse.ArgumentParser()
         parser.add_argument("name", default="Jonh", type=str, help="User name.")
-        args = parser.parse_args()    
+        args = parser.parse_args()
         print_arguments(args)
-    
+
     :param args: Input argparse.Namespace for printing.
     :type args: argparse.Namespace
     """
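
Usage sketch (not part of the patch): the snippet below shows how the new share_rnn_weights argument reaches the model, based only on the constructor signature, parameter names, and defaults visible in the hunks above. The import path, the vocab_size value, and the PaddlePaddle initialization call are assumptions for illustration, not code taken from the repository.

    # Minimal sketch, assuming PaddlePaddle v2 is installed and model.py
    # (modified above) is importable from the working directory.
    import paddle.v2 as paddle
    from model import DeepSpeech2Model

    paddle.init(use_gpu=False, trainer_count=1)

    ds2_model = DeepSpeech2Model(
        vocab_size=29,               # placeholder; normally from the data generator
        num_conv_layers=2,           # defaults shown in layer.py's deep_speech2()
        num_rnn_layers=3,
        rnn_layer_size=2048,         # new default introduced by this change
        use_gru=False,
        pretrained_model_path=None,  # None means training from scratch
        share_rnn_weights=True)      # new argument; affects the simple-RNN path only

When use_gru is True the flag is accepted but has no effect, since bidirectional_gru_bn_layer always builds separate forward and backward input projections.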